# howard.objects.variants
1import csv 2import gc 3import gzip 4import io 5import multiprocessing 6import os 7import random 8import re 9import shlex 10import sqlite3 11import subprocess 12from tempfile import NamedTemporaryFile, TemporaryDirectory 13import tempfile 14import duckdb 15import json 16import yaml 17import argparse 18import Bio.bgzf as bgzf 19import pandas as pd 20from pyfaidx import Fasta 21import numpy as np 22import vcf 23import logging as log 24import fastparquet as fp 25from multiprocesspandas import applyparallel 26import cyvcf2 27import pyBigWig 28 29from howard.functions.commons import * 30from howard.objects.database import * 31from howard.functions.databases import * 32from howard.functions.utils import * 33 34 35class Variants: 36 37 def __init__( 38 self, 39 conn=None, 40 input: str = None, 41 output: str = None, 42 config: dict = {}, 43 param: dict = {}, 44 load: bool = False, 45 ) -> None: 46 """ 47 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 48 header 49 50 :param conn: the connection to the database 51 :param input: the input file 52 :param output: the output file 53 :param config: a dictionary containing the configuration of the model 54 :param param: a dictionary containing the parameters of the model 55 """ 56 57 # Init variables 58 self.init_variables() 59 60 # Input 61 self.set_input(input) 62 63 # Config 64 self.set_config(config) 65 66 # Param 67 self.set_param(param) 68 69 # Output 70 self.set_output(output) 71 72 # connexion 73 self.set_connexion(conn) 74 75 # Header 76 self.set_header() 77 78 # Samples 79 self.set_samples() 80 81 # Load data 82 if load: 83 self.load_data() 84 85 def set_samples(self, samples: list = None) -> list: 86 """ 87 The function `set_samples` sets the samples attribute of an object to a provided list or 88 retrieves it from a parameter dictionary. 
89 90 :param samples: The `set_samples` method is a method of a class that takes a list of samples as 91 input and sets the `samples` attribute of the class to the provided list. If no samples are 92 provided, it tries to get the samples from the class's parameters using the `get_param` method 93 :type samples: list 94 :return: The `samples` list is being returned. 95 """ 96 97 if not samples: 98 samples = self.get_param().get("samples", {}).get("list", None) 99 100 self.samples = samples 101 102 return samples 103 104 def get_samples(self) -> list: 105 """ 106 This function returns a list of samples. 107 :return: The `get_samples` method is returning the `samples` attribute of the object. 108 """ 109 110 return self.samples 111 112 def get_samples_check(self) -> bool: 113 """ 114 This function returns the value of the "check" key within the "samples" dictionary retrieved 115 from the parameters. 116 :return: The method `get_samples_check` is returning the value of the key "check" inside the 117 "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` 118 method. If the key "check" is not found, it will return `False`. 119 """ 120 121 return self.get_param().get("samples", {}).get("check", True) 122 123 def set_input(self, input: str = None) -> None: 124 """ 125 The function `set_input` takes a file name as input, extracts the name and extension, and sets 126 attributes in the class accordingly. 127 128 :param input: The `set_input` method in the provided code snippet is used to set attributes 129 related to the input file. 
Here's a breakdown of the parameters and their usage in the method: 130 :type input: str 131 """ 132 133 if input and not isinstance(input, str): 134 try: 135 self.input = input.name 136 except: 137 log.error(f"Input file '{input} in bad format") 138 raise ValueError(f"Input file '{input} in bad format") 139 else: 140 self.input = input 141 142 # Input format 143 if input: 144 input_name, input_extension = os.path.splitext(self.input) 145 self.input_name = input_name 146 self.input_extension = input_extension 147 self.input_format = self.input_extension.replace(".", "") 148 149 def set_config(self, config: dict) -> None: 150 """ 151 The set_config function takes a config object and assigns it as the configuration object for the 152 class. 153 154 :param config: The `config` parameter in the `set_config` function is a dictionary object that 155 contains configuration settings for the class. When you call the `set_config` function with a 156 dictionary object as the argument, it will set that dictionary as the configuration object for 157 the class 158 :type config: dict 159 """ 160 161 self.config = config 162 163 def set_param(self, param: dict) -> None: 164 """ 165 This function sets a parameter object for the class based on the input dictionary. 
166 167 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 168 as the `param` attribute of the class instance 169 :type param: dict 170 """ 171 172 self.param = param 173 174 def init_variables(self) -> None: 175 """ 176 This function initializes the variables that will be used in the rest of the class 177 """ 178 179 self.prefix = "howard" 180 self.table_variants = "variants" 181 self.dataframe = None 182 183 self.comparison_map = { 184 "gt": ">", 185 "gte": ">=", 186 "lt": "<", 187 "lte": "<=", 188 "equals": "=", 189 "contains": "SIMILAR TO", 190 } 191 192 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 193 194 self.code_type_map_to_sql = { 195 "Integer": "INTEGER", 196 "String": "VARCHAR", 197 "Float": "FLOAT", 198 "Flag": "VARCHAR", 199 } 200 201 self.index_additionnal_fields = [] 202 203 def get_indexing(self) -> bool: 204 """ 205 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 206 returns False. 207 :return: The value of the indexing parameter. 208 """ 209 210 return self.get_param().get("indexing", False) 211 212 def get_connexion_config(self) -> dict: 213 """ 214 The function `get_connexion_config` returns a dictionary containing the configuration for a 215 connection, including the number of threads and memory limit. 216 :return: a dictionary containing the configuration for the Connexion library. 
217 """ 218 219 # config 220 config = self.get_config() 221 222 # Connexion config 223 connexion_config = {} 224 threads = self.get_threads() 225 226 # Threads 227 if threads: 228 connexion_config["threads"] = threads 229 230 # Memory 231 # if config.get("memory", None): 232 # connexion_config["memory_limit"] = config.get("memory") 233 if self.get_memory(): 234 connexion_config["memory_limit"] = self.get_memory() 235 236 # Temporary directory 237 if config.get("tmp", None): 238 connexion_config["temp_directory"] = config.get("tmp") 239 240 # Access 241 if config.get("access", None): 242 access = config.get("access") 243 if access in ["RO"]: 244 access = "READ_ONLY" 245 elif access in ["RW"]: 246 access = "READ_WRITE" 247 connexion_db = self.get_connexion_db() 248 if connexion_db in ":memory:": 249 access = "READ_WRITE" 250 connexion_config["access_mode"] = access 251 252 return connexion_config 253 254 def get_duckdb_settings(self) -> dict: 255 """ 256 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 257 string. 258 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 259 """ 260 261 # config 262 config = self.get_config() 263 264 # duckdb settings 265 duckdb_settings_dict = {} 266 if config.get("duckdb_settings", None): 267 duckdb_settings = config.get("duckdb_settings") 268 duckdb_settings = full_path(duckdb_settings) 269 # duckdb setting is a file 270 if os.path.exists(duckdb_settings): 271 with open(duckdb_settings) as json_file: 272 duckdb_settings_dict = yaml.safe_load(json_file) 273 # duckdb settings is a string 274 else: 275 duckdb_settings_dict = json.loads(duckdb_settings) 276 277 return duckdb_settings_dict 278 279 def set_connexion_db(self) -> str: 280 """ 281 The function `set_connexion_db` returns the appropriate database connection string based on the 282 input format and connection type. 283 :return: the value of the variable `connexion_db`. 
284 """ 285 286 # Default connexion db 287 default_connexion_db = ":memory:" 288 289 # Find connexion db 290 if self.get_input_format() in ["db", "duckdb"]: 291 connexion_db = self.get_input() 292 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 293 connexion_db = default_connexion_db 294 elif self.get_connexion_type() in ["tmpfile"]: 295 tmp_name = tempfile.mkdtemp( 296 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 297 ) 298 connexion_db = f"{tmp_name}/tmp.db" 299 elif self.get_connexion_type() != "": 300 connexion_db = self.get_connexion_type() 301 else: 302 connexion_db = default_connexion_db 303 304 # Set connexion db 305 self.connexion_db = connexion_db 306 307 return connexion_db 308 309 def set_connexion(self, conn) -> None: 310 """ 311 The function `set_connexion` creates a connection to a database, with options for different 312 database formats and settings. 313 314 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 315 database. If a connection is not provided, a new connection to an in-memory database is created. 
316 The method then proceeds to set up the connection based on the specified format (e.g., duckdb or 317 sqlite 318 """ 319 320 # Connexion db 321 connexion_db = self.set_connexion_db() 322 323 # Connexion config 324 connexion_config = self.get_connexion_config() 325 326 # Connexion format 327 connexion_format = self.get_config().get("connexion_format", "duckdb") 328 # Set connexion format 329 self.connexion_format = connexion_format 330 331 # Connexion 332 if not conn: 333 if connexion_format in ["duckdb"]: 334 conn = duckdb.connect(connexion_db, config=connexion_config) 335 # duckDB settings 336 duckdb_settings = self.get_duckdb_settings() 337 if duckdb_settings: 338 for setting in duckdb_settings: 339 setting_value = duckdb_settings.get(setting) 340 if isinstance(setting_value, str): 341 setting_value = f"'{setting_value}'" 342 conn.execute(f"PRAGMA {setting}={setting_value};") 343 elif connexion_format in ["sqlite"]: 344 conn = sqlite3.connect(connexion_db) 345 346 # Set connexion 347 self.conn = conn 348 349 # Log 350 log.debug(f"connexion_format: {connexion_format}") 351 log.debug(f"connexion_db: {connexion_db}") 352 log.debug(f"connexion config: {connexion_config}") 353 log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}") 354 355 def set_output(self, output: str = None) -> None: 356 """ 357 The `set_output` function in Python sets the output file based on the input or a specified key 358 in the config file, extracting the output name, extension, and format. 359 360 :param output: The `output` parameter in the `set_output` method is used to specify the name of 361 the output file. If the config file has an 'output' key, the method sets the output to the value 362 of that key. 
If no output is provided, it sets the output to `None` 363 :type output: str 364 """ 365 366 if output and not isinstance(output, str): 367 self.output = output.name 368 else: 369 self.output = output 370 371 # Output format 372 if self.output: 373 output_name, output_extension = os.path.splitext(self.output) 374 self.output_name = output_name 375 self.output_extension = output_extension 376 self.output_format = self.output_extension.replace(".", "") 377 else: 378 self.output_name = None 379 self.output_extension = None 380 self.output_format = None 381 382 def set_header(self) -> None: 383 """ 384 It reads the header of a VCF file and stores it as a list of strings and as a VCF object 385 """ 386 387 input_file = self.get_input() 388 default_header_list = [ 389 "##fileformat=VCFv4.2", 390 "#CHROM POS ID REF ALT QUAL FILTER INFO", 391 ] 392 393 # Full path 394 input_file = full_path(input_file) 395 396 if input_file: 397 398 input_format = self.get_input_format() 399 input_compressed = self.get_input_compressed() 400 config = self.get_config() 401 header_list = default_header_list 402 if input_format in [ 403 "vcf", 404 "hdr", 405 "tsv", 406 "csv", 407 "psv", 408 "parquet", 409 "db", 410 "duckdb", 411 ]: 412 # header provided in param 413 if config.get("header_file", None): 414 with open(config.get("header_file"), "rt") as f: 415 header_list = self.read_vcf_header(f) 416 # within a vcf file format (header within input file itsself) 417 elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file): 418 # within a compressed vcf file format (.vcf.gz) 419 if input_compressed: 420 with bgzf.open(input_file, "rt") as f: 421 header_list = self.read_vcf_header(f) 422 # within an uncompressed vcf file format (.vcf) 423 else: 424 with open(input_file, "rt") as f: 425 header_list = self.read_vcf_header(f) 426 # header provided in default external file .hdr 427 elif os.path.exists((input_file + ".hdr")): 428 with open(input_file + ".hdr", "rt") as f: 429 header_list = 
self.read_vcf_header(f) 430 else: 431 try: # Try to get header info fields and file columns 432 433 with tempfile.TemporaryDirectory() as tmpdir: 434 435 # Create database 436 db_for_header = Database(database=input_file) 437 438 # Get header columns for infos fields 439 db_header_from_columns = ( 440 db_for_header.get_header_from_columns() 441 ) 442 443 # Get real columns in the file 444 db_header_columns = db_for_header.get_columns() 445 446 # Write header file 447 header_file_tmp = os.path.join(tmpdir, "header") 448 f = open(header_file_tmp, "w") 449 vcf.Writer(f, db_header_from_columns) 450 f.close() 451 452 # Replace #CHROM line with rel columns 453 header_list = db_for_header.read_header_file( 454 header_file=header_file_tmp 455 ) 456 header_list[-1] = "\t".join(db_header_columns) 457 458 except: 459 460 log.warning( 461 f"No header for file {input_file}. Set as default VCF header" 462 ) 463 header_list = default_header_list 464 465 else: # try for unknown format ? 466 467 log.error(f"Input file format '{input_format}' not available") 468 raise ValueError(f"Input file format '{input_format}' not available") 469 470 if not header_list: 471 header_list = default_header_list 472 473 # header as list 474 self.header_list = header_list 475 476 # header as VCF object 477 self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list))) 478 479 else: 480 481 self.header_list = None 482 self.header_vcf = None 483 484 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 485 """ 486 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 487 DataFrame based on the connection format. 488 489 :param query: The `query` parameter in the `get_query_to_df` function is a string that 490 represents the SQL query you want to execute. 
This query will be used to fetch data from a 491 database and convert it into a pandas DataFrame 492 :type query: str 493 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 494 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 495 function will only fetch up to that number of rows from the database query result. If no limit 496 is specified, 497 :type limit: int 498 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 499 """ 500 501 # Connexion format 502 connexion_format = self.get_connexion_format() 503 504 # Limit in query 505 if limit: 506 pd.set_option("display.max_rows", limit) 507 if connexion_format in ["duckdb"]: 508 df = ( 509 self.conn.execute(query) 510 .fetch_record_batch(limit) 511 .read_next_batch() 512 .to_pandas() 513 ) 514 elif connexion_format in ["sqlite"]: 515 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 516 517 # Full query 518 else: 519 if connexion_format in ["duckdb"]: 520 df = self.conn.execute(query).df() 521 elif connexion_format in ["sqlite"]: 522 df = pd.read_sql_query(query, self.conn) 523 524 return df 525 526 def get_overview(self) -> None: 527 """ 528 The function prints the input, output, config, and dataframe of the current object 529 """ 530 table_variants_from = self.get_table_variants(clause="from") 531 sql_columns = self.get_header_columns_as_sql() 532 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 533 df = self.get_query_to_df(sql_query_export) 534 log.info( 535 "Input: " 536 + str(self.get_input()) 537 + " [" 538 + str(str(self.get_input_format())) 539 + "]" 540 ) 541 log.info( 542 "Output: " 543 + str(self.get_output()) 544 + " [" 545 + str(str(self.get_output_format())) 546 + "]" 547 ) 548 log.info("Config: ") 549 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 550 "\n" 551 ): 552 log.info("\t" + str(d)) 553 log.info("Param: ") 554 for d 
in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 555 "\n" 556 ): 557 log.info("\t" + str(d)) 558 log.info("Sample list: " + str(self.get_header_sample_list())) 559 log.info("Dataframe: ") 560 for d in str(df).split("\n"): 561 log.info("\t" + str(d)) 562 563 # garbage collector 564 del df 565 gc.collect() 566 567 return None 568 569 def get_stats(self) -> dict: 570 """ 571 The `get_stats` function calculates and returns various statistics of the current object, 572 including information about the input file, variants, samples, header fields, quality, and 573 SNVs/InDels. 574 :return: a dictionary containing various statistics of the current object. The dictionary has 575 the following structure: 576 """ 577 578 # Log 579 log.info(f"Stats Calculation...") 580 581 # table varaints 582 table_variants_from = self.get_table_variants() 583 584 # stats dict 585 stats = {"Infos": {}} 586 587 ### File 588 input_file = self.get_input() 589 stats["Infos"]["Input file"] = input_file 590 591 # Header 592 header_infos = self.get_header().infos 593 header_formats = self.get_header().formats 594 header_infos_list = list(header_infos) 595 header_formats_list = list(header_formats) 596 597 ### Variants 598 599 stats["Variants"] = {} 600 601 # Variants by chr 602 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 603 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 604 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 605 by=["CHROM"], kind="quicksort" 606 ) 607 608 # Total number of variants 609 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 610 611 # Calculate percentage 612 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 613 lambda x: (x / nb_of_variants) 614 ) 615 616 stats["Variants"]["Number of variants by chromosome"] = ( 617 nb_of_variants_by_chrom.to_dict(orient="index") 618 ) 619 620 
stats["Infos"]["Number of variants"] = int(nb_of_variants) 621 622 ### Samples 623 624 # Init 625 samples = {} 626 nb_of_samples = 0 627 628 # Check Samples 629 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 630 log.debug(f"Check samples...") 631 for sample in self.get_header_sample_list(): 632 sql_query_samples = f""" 633 SELECT '{sample}' as sample, 634 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 635 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 636 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 637 FROM {table_variants_from} 638 WHERE ( 639 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 640 AND 641 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 642 ) 643 GROUP BY genotype 644 """ 645 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 646 sample_genotype_count = sql_query_genotype_df["count"].sum() 647 if len(sql_query_genotype_df): 648 nb_of_samples += 1 649 samples[f"{sample} - {sample_genotype_count} variants"] = ( 650 sql_query_genotype_df.to_dict(orient="index") 651 ) 652 653 stats["Samples"] = samples 654 stats["Infos"]["Number of samples"] = nb_of_samples 655 656 # # 657 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 658 # stats["Infos"]["Number of samples"] = nb_of_samples 659 # elif nb_of_samples: 660 # stats["Infos"]["Number of samples"] = "not a VCF format" 661 662 ### INFO and FORMAT fields 663 header_types_df = {} 664 header_types_list = { 665 "List of INFO fields": header_infos, 666 "List of FORMAT fields": header_formats, 667 } 668 i = 0 669 for header_type in header_types_list: 670 671 header_type_infos = header_types_list.get(header_type) 672 header_infos_dict = {} 673 674 for info in header_type_infos: 675 676 i += 1 677 header_infos_dict[i] = {} 678 679 # ID 680 header_infos_dict[i]["id"] = info 681 682 # num 683 genotype_map = 
{None: ".", -1: "A", -2: "G", -3: "R"} 684 if header_type_infos[info].num in genotype_map.keys(): 685 header_infos_dict[i]["Number"] = genotype_map.get( 686 header_type_infos[info].num 687 ) 688 else: 689 header_infos_dict[i]["Number"] = header_type_infos[info].num 690 691 # type 692 if header_type_infos[info].type: 693 header_infos_dict[i]["Type"] = header_type_infos[info].type 694 else: 695 header_infos_dict[i]["Type"] = "." 696 697 # desc 698 if header_type_infos[info].desc != None: 699 header_infos_dict[i]["Description"] = header_type_infos[info].desc 700 else: 701 header_infos_dict[i]["Description"] = "" 702 703 if len(header_infos_dict): 704 header_types_df[header_type] = pd.DataFrame.from_dict( 705 header_infos_dict, orient="index" 706 ).to_dict(orient="index") 707 708 # Stats 709 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 710 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 711 stats["Header"] = header_types_df 712 713 ### QUAL 714 if "QUAL" in self.get_header_columns(): 715 sql_query_qual = f""" 716 SELECT 717 avg(CAST(QUAL AS INTEGER)) AS Average, 718 min(CAST(QUAL AS INTEGER)) AS Minimum, 719 max(CAST(QUAL AS INTEGER)) AS Maximum, 720 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 721 median(CAST(QUAL AS INTEGER)) AS Median, 722 variance(CAST(QUAL AS INTEGER)) AS Variance 723 FROM {table_variants_from} 724 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 725 """ 726 727 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 728 stats["Quality"] = {"Stats": qual} 729 730 ### SNV and InDel 731 732 sql_query_snv = f""" 733 734 SELECT Type, count FROM ( 735 736 SELECT 737 'Total' AS Type, 738 count(*) AS count 739 FROM {table_variants_from} 740 741 UNION 742 743 SELECT 744 'MNV' AS Type, 745 count(*) AS count 746 FROM {table_variants_from} 747 WHERE len(REF) > 1 AND len(ALT) > 1 748 AND len(REF) = len(ALT) 749 750 UNION 751 752 SELECT 753 'InDel' AS Type, 754 count(*) AS count 755 FROM 
{table_variants_from} 756 WHERE len(REF) > 1 OR len(ALT) > 1 757 AND len(REF) != len(ALT) 758 759 UNION 760 761 SELECT 762 'SNV' AS Type, 763 count(*) AS count 764 FROM {table_variants_from} 765 WHERE len(REF) = 1 AND len(ALT) = 1 766 767 ) 768 769 ORDER BY count DESC 770 771 """ 772 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 773 774 sql_query_snv_substitution = f""" 775 SELECT 776 concat(REF, '>', ALT) AS 'Substitution', 777 count(*) AS count 778 FROM {table_variants_from} 779 WHERE len(REF) = 1 AND len(ALT) = 1 780 GROUP BY REF, ALT 781 ORDER BY count(*) DESC 782 """ 783 snv_substitution = ( 784 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 785 ) 786 stats["Variants"]["Counts"] = snv_indel 787 stats["Variants"]["Substitutions"] = snv_substitution 788 789 return stats 790 791 def stats_to_file(self, file: str = None) -> str: 792 """ 793 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 794 into a JSON object, and writes the JSON object to the specified file. 795 796 :param file: The `file` parameter is a string that represents the file path where the JSON data 797 will be written 798 :type file: str 799 :return: the name of the file that was written to. 800 """ 801 802 # Get stats 803 stats = self.get_stats() 804 805 # Serializing json 806 json_object = json.dumps(stats, indent=4) 807 808 # Writing to sample.json 809 with open(file, "w") as outfile: 810 outfile.write(json_object) 811 812 return file 813 814 def print_stats(self, output_file: str = None, json_file: str = None) -> None: 815 """ 816 The `print_stats` function generates a markdown file and prints the statistics contained in a 817 JSON file in a formatted manner. 818 819 :param output_file: The `output_file` parameter is a string that specifies the path and filename 820 of the output file where the stats will be printed in Markdown format. 
If no `output_file` is 821 provided, a temporary directory will be created and the stats will be saved in a file named 822 "stats.md" within that 823 :type output_file: str 824 :param json_file: The `json_file` parameter is a string that represents the path to the JSON 825 file where the statistics will be saved. If no value is provided, a temporary directory will be 826 created and a default file name "stats.json" will be used 827 :type json_file: str 828 :return: The function `print_stats` does not return any value. It has a return type annotation 829 of `None`. 830 """ 831 832 # Full path 833 output_file = full_path(output_file) 834 json_file = full_path(json_file) 835 836 with tempfile.TemporaryDirectory() as tmpdir: 837 838 # Files 839 if not output_file: 840 output_file = os.path.join(tmpdir, "stats.md") 841 if not json_file: 842 json_file = os.path.join(tmpdir, "stats.json") 843 844 # Create folders 845 if not os.path.exists(os.path.dirname(output_file)): 846 Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True) 847 if not os.path.exists(os.path.dirname(json_file)): 848 Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True) 849 850 # Create stats JSON file 851 stats_file = self.stats_to_file(file=json_file) 852 853 # Print stats file 854 with open(stats_file) as f: 855 stats = yaml.safe_load(f) 856 857 # Output 858 output_title = [] 859 output_index = [] 860 output = [] 861 862 # Title 863 output_title.append("# HOWARD Stats") 864 865 # Index 866 output_index.append("## Index") 867 868 # Process sections 869 for section in stats: 870 infos = stats.get(section) 871 section_link = "#" + section.lower().replace(" ", "-") 872 output.append(f"## {section}") 873 output_index.append(f"- [{section}]({section_link})") 874 875 if len(infos): 876 for info in infos: 877 try: 878 df = pd.DataFrame.from_dict(infos.get(info), orient="index") 879 is_df = True 880 except: 881 try: 882 df = pd.DataFrame.from_dict( 883 
json.loads((infos.get(info))), orient="index" 884 ) 885 is_df = True 886 except: 887 is_df = False 888 if is_df: 889 output.append(f"### {info}") 890 info_link = "#" + info.lower().replace(" ", "-") 891 output_index.append(f" - [{info}]({info_link})") 892 output.append(f"{df.to_markdown(index=False)}") 893 else: 894 output.append(f"- {info}: {infos.get(info)}") 895 else: 896 output.append(f"NA") 897 898 # Write stats in markdown file 899 with open(output_file, "w") as fp: 900 for item in output_title: 901 fp.write("%s\n" % item) 902 for item in output_index: 903 fp.write("%s\n" % item) 904 for item in output: 905 fp.write("%s\n" % item) 906 907 # Output stats in markdown 908 print("") 909 print("\n\n".join(output_title)) 910 print("") 911 print("\n\n".join(output)) 912 print("") 913 914 return None 915 916 def get_input(self) -> str: 917 """ 918 It returns the value of the input variable. 919 :return: The input is being returned. 920 """ 921 return self.input 922 923 def get_input_format(self, input_file: str = None) -> str: 924 """ 925 This function returns the format of the input variable, either from the provided input file or 926 by prompting for input. 927 928 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 929 represents the file path of the input file. If no `input_file` is provided when calling the 930 method, it will default to `None` 931 :type input_file: str 932 :return: The format of the input variable is being returned. 933 """ 934 935 if not input_file: 936 input_file = self.get_input() 937 input_format = get_file_format(input_file) 938 return input_format 939 940 def get_input_compressed(self, input_file: str = None) -> str: 941 """ 942 The function `get_input_compressed` returns the format of the input variable after compressing 943 it. 944 945 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 946 that represents the file path of the input file. 
        If no `input_file` is provided when calling the
        method, it will default to `None` and the method will then call `self.get_input()` to
        :type input_file: str
        :return: The function `get_input_compressed` returns the compressed format of the input
        variable.
        """

        # Fall back to the object's configured input file when none is given
        if not input_file:
            input_file = self.get_input()
        input_compressed = get_file_compressed(input_file)
        return input_compressed

    def get_output(self) -> str:
        """
        Return the output file path of this Variants object.

        :return: The `output` attribute (path to the output file), as set by `set_output`.
        """

        return self.output

    def get_output_format(self, output_file: str = None) -> str:
        """
        Return the file format of the given output file, or of the configured output.

        :param output_file: Path of the output file to inspect. If not provided,
        defaults to the path returned by `get_output`.
        :type output_file: str
        :return: The format of the output file (as detected by `get_file_format`).
        """

        if not output_file:
            output_file = self.get_output()
        output_format = get_file_format(output_file)

        return output_format

    def get_config(self) -> dict:
        """
        Return the configuration dictionary.

        :return: The `config` attribute.
        """
        return self.config

    def get_param(self) -> dict:
        """
        Return the parameters dictionary.

        :return: The `param` attribute.
        """
        return self.param

    def get_connexion_db(self) -> str:
        """
        Return the connexion database identifier of this object.

        :return: The `connexion_db` attribute.
        """
        return self.connexion_db

    def get_prefix(self) -> str:
        """
        Return the prefix of this object.

        :return: The `prefix` attribute.
        """
        return self.prefix

    def get_table_variants(self, clause: str = "select") -> str:
        """
        Return the variants table reference for use in a SQL clause.

        :param clause: The type of clause the table will be used in: "select",
        "where", "update" or "from" (optional), defaults to "select"
        :return: The table name, or for a "from" clause a "<table> as variants"
        expression (a direct parquet-file reference in read-only mode).
        """

        # Access
        access = self.get_config().get("access", None)

        # Clauses "select", "where", "update": plain table name
        if clause in ["select", "where", "update"]:
            table_variants = self.table_variants
        # Clause "from": aliased as "variants"
        elif clause in ["from"]:
            # For Read Only parquet input, query the file directly
            if self.get_input_format() in ["parquet"] and access in ["RO"]:
                input_file = self.get_input()
                table_variants = f"'{input_file}' as variants"
            # For Read Write
            else:
                table_variants = f"{self.table_variants} as variants"
        else:
            table_variants = self.table_variants
        return table_variants

    def get_tmp_dir(self) -> str:
        """
        Return the temporary directory path based on configuration and
        parameters, with "/tmp" as the default.

        :return: The temporary directory path resolved by `get_tmp`.
        """

        return get_tmp(
            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
        )

    def get_connexion_type(self) -> str:
        """
        Return the connexion type from the configuration.

        :return: The "connexion_type" config value, defaulting to "memory".
        NOTE(review): no validation is performed here despite what older
        documentation claimed — unknown values are returned as-is.
        """
        return self.get_config().get("connexion_type", "memory")

    def get_connexion(self):
        """
        Return the database connection object.

        :return: The `conn` attribute (DuckDB or SQLite connection).
        """
        return self.conn

    def close_connexion(self) -> None:
        """
        Close the connection to the database.

        :return: The return value of the underlying `close()` call (typically None).
        """
        return self.conn.close()

    def get_header(self, type: str = "vcf"):
        """
        Return the header of the VCF, either as a parsed object or a list of lines.

        :param type: the representation to return: "vcf" (a `vcf.Reader` header
        object) or "list" (raw header lines), defaults to "vcf" (optional)
        :return: The header in the requested representation. When no header has
        been loaded, a minimal required VCF header (`vcf_required`) is used.
        """

        if self.header_vcf:
            if type == "vcf":
                return self.header_vcf
            elif type == "list":
                return self.header_list
        else:
            # No header loaded: build a minimal header from the required lines
            if type == "vcf":
                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
                return header
            elif type == "list":
                return vcf_required

    def get_header_infos_list(self) -> list:
        """
        Return the list of INFO field IDs declared in the header.

        :return: A list of INFO field identifiers from the header.
        """

        # Init
        infos_list = []

        for field in self.get_header().infos:
            infos_list.append(field)

        return infos_list

    def get_header_length(self, file: str = None) -> int:
        """
        Return the length of the header, excluding the #CHROM line.

        :param file: Optional path to a VCF file; when provided, its header is
        read and its length (minus the #CHROM line) returned instead of the
        loaded header's
        :type file: str
        :return: the number of header lines, excluding the #CHROM line (0 when
        no header is available).
        """

        if file:
            return len(self.read_vcf_header_file(file=file)) - 1
        elif self.get_header(type="list"):
            return len(self.get_header(type="list")) - 1
        else:
            return 0

    def get_header_columns(self) -> str:
        """
        Return the #CHROM columns line of the VCF header.

        :return: The last header line (the tab-separated "#CHROM ..." columns line).
        """
        if self.get_header():
            return self.get_header(type="list")[-1]
        else:
            return ""

    def get_header_columns_as_list(self) -> list:
        """
        Return the VCF header columns as a list.

        :return: The "#CHROM ..." columns line split on tabs (empty list when
        no header is available).
        """
        if self.get_header():
            return self.get_header_columns().strip().split("\t")
        else:
            return []

    def get_header_columns_as_sql(self) -> str:
        """
        Return the VCF header columns as a comma-separated list of quoted SQL
        identifiers.

        :return: A string such as '"#CHROM","POS",...' suitable for a SQL
        column list.
        """
        sql_column_list = []
        for col in self.get_header_columns_as_list():
            sql_column_list.append(f'"{col}"')
        return ",".join(sql_column_list)

    def get_header_sample_list(
        self, check: bool = False, samples: list = None, samples_force: bool = False
    ) -> list:
        """
        Return a list of samples from the VCF header, with optional checking and
        filtering based on the input parameters.

        :param check: If `True`, each sample is verified to be a proper genotype
        column (via `is_genotype_column`); samples that fail are dropped with a
        warning, defaults to False
        :type check: bool (optional)
        :param samples: Optional subset of sample names to keep. Each name is
        checked against the header; names not found are dropped with a warning
        :type samples: list
        :param samples_force: If `True`, the sample list is returned without the
        genotype-column check (a warning is logged), defaults to False
        :type samples_force: bool (optional)
        :return: The list of samples after the requested filtering.
        """

        # Init
        samples_list = []

        if samples is None:
            # No subset requested: take all samples from the header
            samples_list = self.header_vcf.samples
        else:
            # Keep only requested samples that exist in the header
            samples_checked = []
            for sample in samples:
                if sample in self.header_vcf.samples:
                    samples_checked.append(sample)
                else:
                    log.warning(f"Sample '{sample}' not defined in header")
            samples_list = samples_checked

        # Force sample list without checking if is_genotype_column
        if samples_force:
            log.warning(f"Samples {samples_list} not checked if genotypes")
            return samples_list

        if check:
            # Keep only samples whose column really holds genotypes
            samples_checked = []
            for sample in samples_list:
                if self.is_genotype_column(column=sample):
                    samples_checked.append(sample)
                else:
                    log.warning(
                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
                    )
            samples_list = samples_checked

        # Return samples list
        return samples_list

    def is_genotype_column(self, column: str = None) -> bool:
        """
        Check whether a given column is a genotype column.

        :param column: The column name to check. The check is delegated to
        `Database.is_genotype_column` on the input database
        :type column: str
        :return: `True` if the column is a genotype column, `False` otherwise
        (including when `column` is None).
        """

        if column is not None:
            return Database(database=self.get_input()).is_genotype_column(column=column)
        else:
            return False

    def get_verbose(self) -> bool:
        """
        Return the "verbose" flag from the configuration.

        :return: The value of the "verbose" config key, or False if absent.
        """
        return self.get_config().get("verbose", False)

    def get_connexion_format(self) -> str:
        """
        Return the connexion format of the object, validating it.

        :return: The `connexion_format` attribute ("duckdb" or "sqlite").
        :raises ValueError: if the connexion format is not "duckdb" or "sqlite".
        """
        connexion_format = self.connexion_format
        if connexion_format not in ["duckdb", "sqlite"]:
            log.error(f"Unknown connexion format {connexion_format}")
            raise ValueError(f"Unknown connexion format {connexion_format}")
        else:
            return connexion_format

    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table, using the mechanism appropriate for the database
        backend.

        :param file: The file (path or open file object) to load into the table
        :param columns: Comma-separated, quoted column names of the target
        table, used in the INSERT statement
        :type columns: str
        :param header_len: Number of leading lines to skip before the data
        (e.g. the VCF header), defaults to 0
        :type header_len: int (optional)
        :param sep: Field separator used in the file, defaults to "\t"
        :type sep: str (optional)
        :param chunksize: Number of rows per chunk; may be overridden by the
        "load.chunk" config value, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config may override the requested chunk size
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # DuckDB resolves "chunk" to the local pandas DataFrame
                    # via its replacement scan of Python variables
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)

    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
        table before loading the data and specify a sample size.

        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
        table
        :type input_file: str
        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
        determines whether the variants table should be dropped before loading the data. If set to
        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
        not be dropped, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
        the input file.
If it is set to `None`, the default value of 20480 will be used, defaults to 1335 20480 1336 :type sample_size: int (optional) 1337 """ 1338 1339 log.info("Loading...") 1340 1341 # change input file 1342 if input_file: 1343 self.set_input(input_file) 1344 self.set_header() 1345 1346 # drop variants table 1347 if drop_variants_table: 1348 self.drop_variants_table() 1349 1350 # get table variants 1351 table_variants = self.get_table_variants() 1352 1353 # Access 1354 access = self.get_config().get("access", None) 1355 log.debug(f"access: {access}") 1356 1357 # Input format and compress 1358 input_format = self.get_input_format() 1359 input_compressed = self.get_input_compressed() 1360 log.debug(f"input_format: {input_format}") 1361 log.debug(f"input_compressed: {input_compressed}") 1362 1363 # input_compressed_format 1364 if input_compressed: 1365 input_compressed_format = "gzip" 1366 else: 1367 input_compressed_format = "none" 1368 log.debug(f"input_compressed_format: {input_compressed_format}") 1369 1370 # Connexion format 1371 connexion_format = self.get_connexion_format() 1372 1373 # Sample size 1374 if not sample_size: 1375 sample_size = -1 1376 log.debug(f"sample_size: {sample_size}") 1377 1378 # Load data 1379 log.debug(f"Load Data from {input_format}") 1380 1381 # DuckDB connexion 1382 if connexion_format in ["duckdb"]: 1383 1384 # Database already exists 1385 if self.input_format in ["db", "duckdb"]: 1386 1387 if connexion_format in ["duckdb"]: 1388 log.debug(f"Input file format '{self.input_format}' duckDB") 1389 else: 1390 log.error( 1391 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1392 ) 1393 raise ValueError( 1394 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1395 ) 1396 1397 # Load from existing database format 1398 else: 1399 1400 try: 1401 # Create Table or View 1402 database = Database(database=self.input) 1403 sql_from = 
database.get_sql_from(sample_size=sample_size) 1404 1405 if access in ["RO"]: 1406 sql_load = ( 1407 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1408 ) 1409 else: 1410 sql_load = ( 1411 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1412 ) 1413 self.conn.execute(sql_load) 1414 1415 except: 1416 # Format not available 1417 log.error(f"Input file format '{self.input_format}' not available") 1418 raise ValueError( 1419 f"Input file format '{self.input_format}' not available" 1420 ) 1421 1422 # SQLite connexion 1423 elif connexion_format in ["sqlite"] and input_format in [ 1424 "vcf", 1425 "tsv", 1426 "csv", 1427 "psv", 1428 ]: 1429 1430 # Main structure 1431 structure = { 1432 "#CHROM": "VARCHAR", 1433 "POS": "INTEGER", 1434 "ID": "VARCHAR", 1435 "REF": "VARCHAR", 1436 "ALT": "VARCHAR", 1437 "QUAL": "VARCHAR", 1438 "FILTER": "VARCHAR", 1439 "INFO": "VARCHAR", 1440 } 1441 1442 # Strcuture with samples 1443 structure_complete = structure 1444 if self.get_header_sample_list(): 1445 structure["FORMAT"] = "VARCHAR" 1446 for sample in self.get_header_sample_list(): 1447 structure_complete[sample] = "VARCHAR" 1448 1449 # Columns list for create and insert 1450 sql_create_table_columns = [] 1451 sql_create_table_columns_list = [] 1452 for column in structure_complete: 1453 column_type = structure_complete[column] 1454 sql_create_table_columns.append( 1455 f'"{column}" {column_type} default NULL' 1456 ) 1457 sql_create_table_columns_list.append(f'"{column}"') 1458 1459 # Create database 1460 log.debug(f"Create Table {table_variants}") 1461 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1462 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1463 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1464 self.conn.execute(sql_create_table) 1465 1466 # chunksize define length of file chunk load file 1467 chunksize = 100000 1468 1469 # delimiter 1470 delimiter 
= file_format_delimiters.get(input_format, "\t") 1471 1472 # Load the input file 1473 with open(self.input, "rt") as input_file: 1474 1475 # Use the appropriate file handler based on the input format 1476 if input_compressed: 1477 input_file = bgzf.open(self.input, "rt") 1478 if input_format in ["vcf"]: 1479 header_len = self.get_header_length() 1480 else: 1481 header_len = 0 1482 1483 # Insert the file contents into a table 1484 self.insert_file_to_table( 1485 input_file, 1486 columns=sql_create_table_columns_list_sql, 1487 header_len=header_len, 1488 sep=delimiter, 1489 chunksize=chunksize, 1490 ) 1491 1492 else: 1493 log.error( 1494 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1495 ) 1496 raise ValueError( 1497 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1498 ) 1499 1500 # Explode INFOS fields into table fields 1501 if self.get_explode_infos(): 1502 self.explode_infos( 1503 prefix=self.get_explode_infos_prefix(), 1504 fields=self.get_explode_infos_fields(), 1505 force=True, 1506 ) 1507 1508 # Create index after insertion 1509 self.create_indexes() 1510 1511 def get_explode_infos(self) -> bool: 1512 """ 1513 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1514 to False if it is not set. 1515 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1516 value. If the parameter is not present, it will return False. 1517 """ 1518 1519 return self.get_param().get("explode", {}).get("explode_infos", False) 1520 1521 def get_explode_infos_fields( 1522 self, 1523 explode_infos_fields: str = None, 1524 remove_fields_not_in_header: bool = False, 1525 ) -> list: 1526 """ 1527 The `get_explode_infos_fields` function returns a list of exploded information fields based on 1528 the input parameter `explode_infos_fields`. 

        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
        comma-separated list of field names to explode
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
        flag that determines whether to remove fields that are not present in the header. If it is set
        to `True`, any field that is not in the header will be excluded from the list of exploded
        information fields, defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: A list of field names after expanding patterns (regex, "*")
        against the header INFO fields, deduplicated, with optional filtering of
        fields absent from the header.
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If no fields, defined as all fields in header using keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list (accepts a comma-separated string or a list)
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # format keyword * in regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all header fields matching the pattern
                r = re.compile(field)
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # An exact match takes priority over pattern expansion;
                # otherwise drop explicitly listed fields from the expansion
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header (avoid not well formatted header)
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header (if asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []

    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
        """
        Return the prefix used for exploded INFO columns.

        :param explode_infos_prefix: Explicit prefix; when not provided, the
        value of param["explode"]["explode_infos_prefix"] (default "") is used
        :type explode_infos_prefix: str
        :return: the resolved prefix string.
        """

        if not explode_infos_prefix:
            explode_infos_prefix = (
                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
            )

        return explode_infos_prefix

    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        Add a column to a SQLite or DuckDB table, with an optional default
        value, if it doesn't already exist.

        :param table_name: The name of the table to which to add the column
        :param column_name: The name of the column to add
        :param column_type: The SQL data type of the new column (e.g.
        "INTEGER", "VARCHAR")
        :param default_value: Optional SQL default value for the new column
        :param drop: If `True` and the column already exists, drop it and
        re-create it; if `False`, leave an existing column untouched, defaults
        to False
        :type drop: bool (optional)
        :return: a dict describing the column ({"table_name", "column_name",
        "column_type", "default_value"}) when a brand-new column was added,
        otherwise None. NOTE(review): when `drop=True` and the column already
        existed, it IS dropped and re-created but None is still returned
        (`added = not dropped`); callers such as `explode_infos` compensate by
        testing `added_column or force`.
        """

        # added
        added = False
        dropped = False

        # Check if the column already exists in the table
        # (case-insensitive comparison against the table's column list)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name.upper() in [c.upper() for c in columns]:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column

    def drop_column(
        self, column: dict = None, table_name: str = None, column_name: str = None
    ) -> bool:
        """
        Drop a specified column from a given table.

        :param column: Either a dict with "table_name" and "column_name" keys,
        or a plain column-name string (the variants table is then assumed);
        takes precedence over the separate arguments
        :type column: dict
        :param table_name: The name of the table to drop the column from
        :type table_name: str
        :param column_name: The name of the column to drop
        :type column_name: str
        :return: True if the column was dropped, False if it does not exist in
        the table (or if neither a table nor a column was identified).
        """

        # Find column infos
        if column:
            if isinstance(column, dict):
                table_name = column.get("table_name", None)
                column_name = column.get("column_name", None)
            elif isinstance(column, str):
                table_name = self.get_table_variants()
                column_name = column
            else:
                table_name = None
                column_name = None

        # Bail out only when BOTH are missing (a single missing piece falls
        # through to the existence check below and returns False)
        if not table_name and not column_name:
            return False

        # Removed
        removed = False

        # Check if the column exists in the table (case-sensitive here)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name in columns:
            log.debug(f"The {column_name} column exists in the {table_name} table")
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
            return False

        # Drop column from table, e.g. ALTER TABLE integers DROP k
        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
        self.execute_query(add_column_query)
        removed = True
        log.debug(
            f"The {column_name} column was successfully dropped to the {table_name} table"
        )

        return removed

    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
        individual columns, returning a list of added columns.

        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
        `self.get_explode_infos_prefix()` as the prefix
        :type prefix: str
        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
        `False`, indexes will not be created. The default value is `False`, defaults to False
        :type create_index: bool (optional)
        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
        that you want to explode into individual columns. If this parameter is not provided, all INFO
        fields will be exploded. You can specify the INFO fields you want to explode by passing them as
        a list
        :type fields: list
        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
        determines whether to drop and recreate a column if it already exists in the table. If `force`
        is set to `True`, the column will be dropped and recreated, defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
        flag that determines whether to process all the INFO fields together or individually. If set to
        `True`, all the INFO fields will be processed together.
If set to `False`, each INFO field will 1804 be processed individually. The default value is, defaults to False 1805 :type proccess_all_fields_together: bool (optional) 1806 :param table: The `table` parameter in the `explode_infos` function is used to specify the name 1807 of the table where the exploded INFO fields will be added as individual columns. If you provide 1808 a value for the `table` parameter, the function will use that table name. If the `table` 1809 parameter is 1810 :type table: str 1811 :return: The `explode_infos` function returns a list of added columns. 1812 """ 1813 1814 # drop indexes 1815 self.drop_indexes() 1816 1817 # connexion format 1818 connexion_format = self.get_connexion_format() 1819 1820 # Access 1821 access = self.get_config().get("access", None) 1822 1823 # Added columns 1824 added_columns = [] 1825 1826 if access not in ["RO"]: 1827 1828 # prefix 1829 if prefix in [None, True] or not isinstance(prefix, str): 1830 if self.get_explode_infos_prefix() not in [None, True]: 1831 prefix = self.get_explode_infos_prefix() 1832 else: 1833 prefix = "INFO/" 1834 1835 # table variants 1836 if table is not None: 1837 table_variants = table 1838 else: 1839 table_variants = self.get_table_variants(clause="select") 1840 1841 # extra infos 1842 try: 1843 extra_infos = self.get_extra_infos() 1844 except: 1845 extra_infos = [] 1846 1847 # Header infos 1848 header_infos = self.get_header().infos 1849 1850 log.debug( 1851 f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields" 1852 ) 1853 1854 sql_info_alter_table_array = [] 1855 1856 # Info fields to check 1857 fields_list = list(header_infos) 1858 if fields: 1859 fields_list += fields 1860 fields_list = set(fields_list) 1861 1862 # If no fields 1863 if not fields: 1864 fields = [] 1865 1866 # Translate fields if patterns 1867 fields = self.get_explode_infos_fields(explode_infos_fields=fields) 1868 1869 for info in fields: 1870 1871 info_id_sql = prefix + info 1872 1873 if ( 1874 info 
in fields_list 1875 or prefix + info in fields_list 1876 or info in extra_infos 1877 ): 1878 1879 log.debug(f"Explode INFO fields - ADD '{info}' annotations fields") 1880 1881 if info in header_infos: 1882 info_type = header_infos[info].type 1883 info_num = header_infos[info].num 1884 else: 1885 info_type = "String" 1886 info_num = 0 1887 1888 type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR") 1889 if info_num != 1: 1890 type_sql = "VARCHAR" 1891 1892 # Add field 1893 added_column = self.add_column( 1894 table_name=table_variants, 1895 column_name=info_id_sql, 1896 column_type=type_sql, 1897 default_value="null", 1898 drop=force, 1899 ) 1900 1901 if added_column: 1902 added_columns.append(added_column) 1903 1904 if added_column or force: 1905 1906 # add field to index 1907 self.index_additionnal_fields.append(info_id_sql) 1908 1909 # Update field array 1910 if connexion_format in ["duckdb"]: 1911 update_info_field = f""" 1912 "{info_id_sql}" = 1913 CASE 1914 WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL 1915 ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) 1916 END 1917 """ 1918 elif connexion_format in ["sqlite"]: 1919 update_info_field = f""" 1920 "{info_id_sql}" = 1921 CASE 1922 WHEN instr(INFO, '{info}=') = 0 THEN NULL 1923 WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1) 1924 ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1) 1925 END 1926 """ 1927 1928 sql_info_alter_table_array.append(update_info_field) 1929 1930 if sql_info_alter_table_array: 1931 1932 # By chromosomes 1933 try: 1934 chromosomes_list = list( 1935 self.get_query_to_df( 1936 f""" 
SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """ 1937 )["#CHROM"] 1938 ) 1939 except: 1940 chromosomes_list = [None] 1941 1942 for chrom in chromosomes_list: 1943 log.debug(f"Explode INFO fields - Chromosome {chrom}...") 1944 1945 # Where clause 1946 where_clause = "" 1947 if chrom and len(chromosomes_list) > 1: 1948 where_clause = f""" WHERE "#CHROM" = '{chrom}' """ 1949 1950 # Update table 1951 if proccess_all_fields_together: 1952 sql_info_alter_table_array_join = ", ".join( 1953 sql_info_alter_table_array 1954 ) 1955 if sql_info_alter_table_array_join: 1956 sql_info_alter_table = f""" 1957 UPDATE {table_variants} 1958 SET {sql_info_alter_table_array_join} 1959 {where_clause} 1960 """ 1961 log.debug( 1962 f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..." 1963 ) 1964 # log.debug(sql_info_alter_table) 1965 self.conn.execute(sql_info_alter_table) 1966 else: 1967 sql_info_alter_num = 0 1968 for sql_info_alter in sql_info_alter_table_array: 1969 sql_info_alter_num += 1 1970 sql_info_alter_table = f""" 1971 UPDATE {table_variants} 1972 SET {sql_info_alter} 1973 {where_clause} 1974 """ 1975 log.debug( 1976 f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..." 
                        )
                        # log.debug(sql_info_alter_table)
                        self.conn.execute(sql_info_alter_table)

        # Re-create indexes invalidated by the added columns, if requested
        if create_index:
            self.create_indexes()

        return added_columns

    def create_indexes(self) -> None:
        """
        Create indexes on the variants table after insertion.

        Indexes are created only when indexing is enabled and the database is
        not opened in read-only ("RO") access mode: a composite index on
        ("#CHROM", "POS", "REF", "ALT"), one index per coordinate column, and
        one index per exploded INFO field registered in
        `self.index_additionnal_fields`.
        """

        # Access mode ("RO" means read-only: no index creation allowed)
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        if self.get_indexing() and access not in ["RO"]:
            # Composite index on the variant coordinates
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
            self.conn.execute(sql_create_table_index)
            # One index per exploded INFO field
            for field in self.index_additionnal_fields:
                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
                self.conn.execute(sql_create_table_index)

    def drop_indexes(self) -> None:
        """
        Drop all indexes of the variants table.

        Index names are listed from the engine catalog (duckdb_indexes for
        DuckDB, sqlite_master for SQLite) and each one is dropped, unless the
        database is opened in read-only ("RO") access mode.
        """

        # Access mode ("RO" means read-only: no index drop allowed)
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        # Get database format
        connexion_format = self.get_connexion_format()

        if access not in ["RO"]:
            # NOTE(review): assumes connexion_format is "duckdb" or "sqlite";
            # any other value would leave sql_list_indexes undefined (NameError)
            if connexion_format in ["duckdb"]:
                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
            elif connexion_format in ["sqlite"]:
                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"

            list_indexes = self.conn.execute(sql_list_indexes)
            index_names = [row[0] for row in list_indexes.fetchall()]
            for index in index_names:
                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
                self.conn.execute(sql_drop_table_index)

    def read_vcf_header(self, f) -> list:
        """
        It reads the header of a VCF file and returns a list of the header lines.

        Lines are accumulated up to and including the "#CHROM" line; if no
        "#CHROM" line is present, every line of the file is returned.

        :param f: the file object
        :return: The header lines of the VCF file.
        """

        header_list = []
        for line in f:
            header_list.append(line)
            if line.startswith("#CHROM"):
                break
        return header_list

    def read_vcf_header_file(self, file: str = None) -> list:
        """
        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed
        (bgzip) and uncompressed files.

        :param file: path to the VCF (or VCF header) file to read
        :type file: str
        :return: The function `read_vcf_header_file` returns a list of header lines.
        """

        # Choose bgzf or plain text reader depending on compression
        if self.get_input_compressed(input_file=file):
            with bgzf.open(file, "rt") as f:
                return self.read_vcf_header(f=f)
        else:
            with open(file, "rt") as f:
                return self.read_vcf_header(f=f)

    def execute_query(self, query: str):
        """
        It takes a query as an argument, executes it, and returns the results

        :param query: The query to be executed
        :return: The result of the query is being returned.
2080 """ 2081 if query: 2082 return self.conn.execute(query) # .fetchall() 2083 else: 2084 return None 2085 2086 def export_output( 2087 self, 2088 output_file: str | None = None, 2089 output_header: str | None = None, 2090 export_header: bool = True, 2091 query: str | None = None, 2092 parquet_partitions: list | None = None, 2093 chunk_size: int | None = None, 2094 threads: int | None = None, 2095 sort: bool = False, 2096 index: bool = False, 2097 order_by: str | None = None, 2098 ) -> bool: 2099 """ 2100 The `export_output` function exports data from a VCF file to a specified output file in various 2101 formats, including VCF, CSV, TSV, PSV, and Parquet. 2102 2103 :param output_file: The `output_file` parameter is a string that specifies the name of the 2104 output file to be generated by the function. This is where the exported data will be saved 2105 :type output_file: str 2106 :param output_header: The `output_header` parameter is a string that specifies the name of the 2107 file where the header of the VCF file will be exported. If this parameter is not provided, the 2108 header will be exported to a file with the same name as the `output_file` parameter, but with 2109 the extension " 2110 :type output_header: str 2111 :param export_header: The `export_header` parameter is a boolean flag that determines whether 2112 the header of a VCF file should be exported to a separate file or not. If `export_header` is 2113 True, the header will be exported to a file. If `export_header` is False, the header will not 2114 be, defaults to True, if output format is not VCF 2115 :type export_header: bool (optional) 2116 :param query: The `query` parameter is an optional SQL query that can be used to filter and 2117 select specific data from the VCF file before exporting it. 
If provided, only the data that 2118 matches the query will be exported 2119 :type query: str 2120 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 2121 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 2122 organize data in a hierarchical directory structure based on the values of one or more columns. 2123 This can improve query performance when working with large datasets 2124 :type parquet_partitions: list 2125 :param chunk_size: The `chunk_size` parameter specifies the number of 2126 records in batch when exporting data in Parquet format. This parameter is used for 2127 partitioning the Parquet file into multiple files. 2128 :type chunk_size: int 2129 :param threads: The `threads` parameter is an optional parameter that specifies the number of 2130 threads to be used during the export process. It determines the level of parallelism and can 2131 improve the performance of the export operation. If not provided, the function will use the 2132 default number of threads 2133 :type threads: int 2134 :param sort: The `sort` parameter is a boolean flag that determines whether the output file 2135 should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the 2136 genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to 2137 False 2138 :type sort: bool (optional) 2139 :param index: The `index` parameter is a boolean flag that determines whether an index should be 2140 created on the output file. If `index` is True, an index will be created. If `index` is False, 2141 no index will be created. The default value is False, defaults to False 2142 :type index: bool (optional) 2143 :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for 2144 sorting the output file. This parameter is only applicable when exporting data in VCF format 2145 :type order_by: str 2146 :return: a boolean value. 
It checks if the output file exists and returns True if it does, or 2147 None if it doesn't. 2148 """ 2149 2150 # Log 2151 log.info("Exporting...") 2152 2153 # Full path 2154 output_file = full_path(output_file) 2155 output_header = full_path(output_header) 2156 2157 # Config 2158 config = self.get_config() 2159 2160 # Param 2161 param = self.get_param() 2162 2163 # Tmp files to remove 2164 tmp_to_remove = [] 2165 2166 # If no output, get it 2167 if not output_file: 2168 output_file = self.get_output() 2169 2170 # If not threads 2171 if not threads: 2172 threads = self.get_threads() 2173 2174 # Auto header name with extension 2175 if export_header or output_header: 2176 if not output_header: 2177 output_header = f"{output_file}.hdr" 2178 # Export header 2179 self.export_header(output_file=output_file) 2180 2181 # Switch off export header if VCF output 2182 output_file_type = get_file_format(output_file) 2183 if output_file_type in ["vcf"]: 2184 export_header = False 2185 tmp_to_remove.append(output_header) 2186 2187 # Chunk size 2188 if not chunk_size: 2189 chunk_size = config.get("chunk_size", None) 2190 2191 # Parquet partition 2192 if not parquet_partitions: 2193 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2194 if parquet_partitions and isinstance(parquet_partitions, str): 2195 parquet_partitions = parquet_partitions.split(",") 2196 2197 # Order by 2198 if not order_by: 2199 order_by = param.get("export", {}).get("order_by", "") 2200 2201 # Header in output 2202 header_in_output = param.get("export", {}).get("include_header", False) 2203 2204 # Database 2205 database_source = self.get_connexion() 2206 2207 # Connexion format 2208 connexion_format = self.get_connexion_format() 2209 2210 # Explode infos 2211 if self.get_explode_infos(): 2212 self.explode_infos( 2213 prefix=self.get_explode_infos_prefix(), 2214 fields=self.get_explode_infos_fields(), 2215 force=False, 2216 ) 2217 2218 # if connexion_format in ["sqlite"] or query: 
2219 if connexion_format in ["sqlite"]: 2220 2221 # Export in Parquet 2222 random_tmp = "".join( 2223 random.choice(string.ascii_lowercase) for i in range(10) 2224 ) 2225 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2226 tmp_to_remove.append(database_source) 2227 2228 # Table Variants 2229 table_variants = self.get_table_variants() 2230 2231 # Create export query 2232 sql_query_export_subquery = f""" 2233 SELECT * FROM {table_variants} 2234 """ 2235 2236 # Write source file 2237 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2238 2239 # Create database 2240 database = Database( 2241 database=database_source, 2242 table="variants", 2243 header_file=output_header, 2244 conn_config=self.get_connexion_config(), 2245 ) 2246 2247 # Existing colomns header 2248 existing_columns_header = database.get_header_columns_from_database(query=query) 2249 2250 # Sample list 2251 if output_file_type in ["vcf"]: 2252 get_samples = self.get_samples() 2253 get_samples_check = self.get_samples_check() 2254 samples_force = get_samples is not None 2255 sample_list = self.get_header_sample_list( 2256 check=get_samples_check, 2257 samples=get_samples, 2258 samples_force=samples_force, 2259 ) 2260 else: 2261 sample_list = None 2262 2263 # Export file 2264 database.export( 2265 output_database=output_file, 2266 output_header=output_header, 2267 existing_columns_header=existing_columns_header, 2268 parquet_partitions=parquet_partitions, 2269 chunk_size=chunk_size, 2270 threads=threads, 2271 sort=sort, 2272 index=index, 2273 header_in_output=header_in_output, 2274 order_by=order_by, 2275 query=query, 2276 export_header=export_header, 2277 sample_list=sample_list, 2278 ) 2279 2280 # Remove 2281 remove_if_exists(tmp_to_remove) 2282 2283 return (os.path.exists(output_file) or None) and ( 2284 os.path.exists(output_file) or None 2285 ) 2286 2287 def get_extra_infos(self, table: str = None) -> list: 2288 """ 2289 The `get_extra_infos` 
function returns a list of columns that are in a specified table but not 2290 in the header. 2291 2292 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2293 name of the table from which you want to retrieve the extra columns that are not present in the 2294 header. If the `table` parameter is not provided when calling the function, it will default to 2295 using the variants 2296 :type table: str 2297 :return: A list of columns that are in the specified table but not in the header of the table. 2298 """ 2299 2300 header_columns = [] 2301 2302 if not table: 2303 table = self.get_table_variants(clause="from") 2304 header_columns = self.get_header_columns() 2305 2306 # Check all columns in the database 2307 query = f""" SELECT * FROM {table} LIMIT 1 """ 2308 log.debug(f"query {query}") 2309 table_columns = self.get_query_to_df(query).columns.tolist() 2310 extra_columns = [] 2311 2312 # Construct extra infos (not in header) 2313 for column in table_columns: 2314 if column not in header_columns: 2315 extra_columns.append(column) 2316 2317 return extra_columns 2318 2319 def get_extra_infos_sql(self, table: str = None) -> str: 2320 """ 2321 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2322 by double quotes 2323 2324 :param table: The name of the table to get the extra infos from. If None, the default table is 2325 used 2326 :type table: str 2327 :return: A string of the extra infos 2328 """ 2329 2330 return ", ".join( 2331 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2332 ) 2333 2334 def export_header( 2335 self, 2336 header_name: str = None, 2337 output_file: str = None, 2338 output_file_ext: str = ".hdr", 2339 clean_header: bool = True, 2340 remove_chrom_line: bool = False, 2341 ) -> str: 2342 """ 2343 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2344 specified options, and writes it to a new file. 
2345 2346 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2347 this parameter is not specified, the header will be written to the output file 2348 :type header_name: str 2349 :param output_file: The `output_file` parameter in the `export_header` function is used to 2350 specify the name of the output file where the header will be written. If this parameter is not 2351 provided, the header will be written to a temporary file 2352 :type output_file: str 2353 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2354 string that represents the extension of the output header file. By default, it is set to ".hdr" 2355 if not specified by the user. This extension will be appended to the `output_file` name to 2356 create the final, defaults to .hdr 2357 :type output_file_ext: str (optional) 2358 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2359 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2360 `True`, the function will clean the header by modifying certain lines based on a specific 2361 pattern. If `clean_header`, defaults to True 2362 :type clean_header: bool (optional) 2363 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2364 boolean flag that determines whether the #CHROM line should be removed from the header before 2365 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2366 defaults to False 2367 :type remove_chrom_line: bool (optional) 2368 :return: The function `export_header` returns the name of the temporary header file that is 2369 created. 
2370 """ 2371 2372 if not header_name and not output_file: 2373 output_file = self.get_output() 2374 2375 if self.get_header(): 2376 2377 # Get header object 2378 header_obj = self.get_header() 2379 2380 # Create database 2381 db_for_header = Database(database=self.get_input()) 2382 2383 # Get real columns in the file 2384 db_header_columns = db_for_header.get_columns() 2385 2386 with tempfile.TemporaryDirectory() as tmpdir: 2387 2388 # Write header file 2389 header_file_tmp = os.path.join(tmpdir, "header") 2390 f = open(header_file_tmp, "w") 2391 vcf.Writer(f, header_obj) 2392 f.close() 2393 2394 # Replace #CHROM line with rel columns 2395 header_list = db_for_header.read_header_file( 2396 header_file=header_file_tmp 2397 ) 2398 header_list[-1] = "\t".join(db_header_columns) 2399 2400 # Remove CHROM line 2401 if remove_chrom_line: 2402 header_list.pop() 2403 2404 # Clean header 2405 if clean_header: 2406 header_list_clean = [] 2407 for head in header_list: 2408 # Clean head for malformed header 2409 head_clean = head 2410 head_clean = re.subn( 2411 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2412 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2413 head_clean, 2414 2, 2415 )[0] 2416 # Write header 2417 header_list_clean.append(head_clean) 2418 header_list = header_list_clean 2419 2420 tmp_header_name = output_file + output_file_ext 2421 2422 f = open(tmp_header_name, "w") 2423 for line in header_list: 2424 f.write(line) 2425 f.close() 2426 2427 return tmp_header_name 2428 2429 def export_variant_vcf( 2430 self, 2431 vcf_file, 2432 remove_info: bool = False, 2433 add_samples: bool = True, 2434 list_samples: list = [], 2435 where_clause: str = "", 2436 index: bool = False, 2437 threads: int | None = None, 2438 ) -> bool | None: 2439 """ 2440 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2441 remove INFO field, add samples, and control compression and indexing. 

        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
        written to. It is the output file that will contain the filtered VCF data based on the specified
        parameters
        :param remove_info: The `remove_info` parameter is a boolean flag that determines whether to
        remove the INFO field from the output VCF file. If set to `True`, the INFO field will be
        replaced by "."; a string value is used verbatim as the INFO replacement, defaults to False
        :type remove_info: bool (optional)
        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
        the samples should be added to the VCF file or not. If set to True, the samples will be added.
        If set to False, the samples will be removed. The default value is True, defaults to True
        :type add_samples: bool (optional)
        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
        in the output VCF file. By default, all samples will be included. If you provide a list of
        samples, only those samples will be included in the output file
        :type list_samples: list
        :param index: The `index` parameter is a boolean flag that determines whether or not to create
        an index for the output VCF file (e.g. tabix), defaults to False
        :type index: bool (optional)
        :param threads: The `threads` parameter specifies the number of threads to use for exporting
        the VCF file; defaults to `self.get_threads()` when not provided
        :type threads: int | None
        :return: The result of `export_output`: True if the output file exists after export,
        None otherwise.
        """

        # Config
        config = self.get_config()

        # Extract VCF
        log.debug("Export VCF...")

        # Table variants
        table_variants = self.get_table_variants()

        # Threads
        if not threads:
            threads = self.get_threads()

        # Info fields: either keep INFO as-is, or replace it by a constant
        # (a non-string truthy remove_info becomes ".")
        if remove_info:
            if not isinstance(remove_info, str):
                remove_info = "."
            info_field = f"""'{remove_info}' as INFO"""
        else:
            info_field = "INFO"

        # Samples fields: FORMAT plus the requested (or all header) samples
        if add_samples:
            if not list_samples:
                list_samples = self.get_header_sample_list()
            if list_samples:
                samples_fields = " , FORMAT , " + " , ".join(list_samples)
            else:
                samples_fields = ""
            log.debug(f"samples_fields: {samples_fields}")
        else:
            samples_fields = ""

        # Where clause
        if where_clause is None:
            where_clause = ""

        # Variants selection query (standard VCF columns + INFO + samples)
        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
        log.debug(f"sql_query_select={sql_query_select}")

        return self.export_output(
            output_file=vcf_file,
            output_header=None,
            export_header=True,
            query=sql_query_select,
            parquet_partitions=None,
            chunk_size=config.get("chunk_size", None),
            threads=threads,
            sort=True,
            index=index,
            order_by=None,
        )

    def run_commands(self, commands: list = [], threads: int = 1) -> None:
        """
        It takes a list of commands and runs
them in parallel using the number of threads specified 2532 2533 :param commands: A list of commands to run 2534 :param threads: The number of threads to use, defaults to 1 (optional) 2535 """ 2536 2537 run_parallel_commands(commands, threads) 2538 2539 def get_threads(self, default: int = 1) -> int: 2540 """ 2541 This function returns the number of threads to use for a job, with a default value of 1 if not 2542 specified. 2543 2544 :param default: The `default` parameter in the `get_threads` method is used to specify the 2545 default number of threads to use if no specific value is provided. If no value is provided for 2546 the `threads` parameter in the configuration or input parameters, the `default` value will be 2547 used, defaults to 1 2548 :type default: int (optional) 2549 :return: the number of threads to use for the current job. 2550 """ 2551 2552 # Config 2553 config = self.get_config() 2554 2555 # Param 2556 param = self.get_param() 2557 2558 # Input threads 2559 input_thread = param.get("threads", config.get("threads", None)) 2560 2561 # Check threads 2562 if not input_thread: 2563 threads = default 2564 elif int(input_thread) <= 0: 2565 threads = os.cpu_count() 2566 else: 2567 threads = int(input_thread) 2568 return threads 2569 2570 def get_memory(self, default: str = None) -> str: 2571 """ 2572 This function retrieves the memory value from parameters or configuration with a default value 2573 if not found. 2574 2575 :param default: The `get_memory` function takes in a default value as a string parameter. This 2576 default value is used as a fallback in case the `memory` parameter is not provided in the 2577 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2578 the function 2579 :type default: str 2580 :return: The `get_memory` function returns a string value representing the memory parameter. If 2581 the `input_memory` is provided in the parameters, it will return that value. 
Otherwise, it will 2582 return the default value provided as an argument to the function. 2583 """ 2584 2585 # Config 2586 config = self.get_config() 2587 2588 # Param 2589 param = self.get_param() 2590 2591 # Input threads 2592 input_memory = param.get("memory", config.get("memory", None)) 2593 2594 # Check threads 2595 if input_memory: 2596 memory = input_memory 2597 else: 2598 memory = default 2599 2600 return memory 2601 2602 def update_from_vcf(self, vcf_file: str) -> None: 2603 """ 2604 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2605 2606 :param vcf_file: the path to the VCF file 2607 """ 2608 2609 connexion_format = self.get_connexion_format() 2610 2611 if connexion_format in ["duckdb"]: 2612 self.update_from_vcf_duckdb(vcf_file) 2613 elif connexion_format in ["sqlite"]: 2614 self.update_from_vcf_sqlite(vcf_file) 2615 2616 def update_from_vcf_duckdb(self, vcf_file: str) -> None: 2617 """ 2618 It takes a VCF file and updates the INFO column of the variants table in the database with the 2619 INFO column of the VCF file 2620 2621 :param vcf_file: the path to the VCF file 2622 """ 2623 2624 # varaints table 2625 table_variants = self.get_table_variants() 2626 2627 # Loading VCF into temporaire table 2628 skip = self.get_header_length(file=vcf_file) 2629 vcf_df = pd.read_csv( 2630 vcf_file, 2631 sep="\t", 2632 engine="c", 2633 skiprows=skip, 2634 header=0, 2635 low_memory=False, 2636 ) 2637 sql_query_update = f""" 2638 UPDATE {table_variants} as table_variants 2639 SET INFO = concat( 2640 CASE 2641 WHEN INFO NOT IN ('', '.') 2642 THEN INFO 2643 ELSE '' 2644 END, 2645 ( 2646 SELECT 2647 concat( 2648 CASE 2649 WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.') 2650 THEN ';' 2651 ELSE '' 2652 END 2653 , 2654 CASE 2655 WHEN table_parquet.INFO NOT IN ('','.') 2656 THEN table_parquet.INFO 2657 ELSE '' 2658 END 2659 ) 2660 FROM vcf_df as table_parquet 2661 WHERE CAST(table_parquet.\"#CHROM\" AS 
VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                            AND table_parquet.\"POS\" = table_variants.\"POS\"
                            AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                            AND table_parquet.\"REF\" = table_variants.\"REF\"
                            AND table_parquet.INFO NOT IN ('','.')
                        )
                    )
        ;
        """
        self.conn.execute(sql_query_update)

    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
        table, then updates the INFO column of the variants table with the INFO column of the temporary
        table (matching on #CHROM/POS/REF/ALT).

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table for the VCF (same schema as variants, no rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Loading VCF into temporary table (header lines skipped via comment='#')
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: CONCAT as || operator (SQLite has no concat() function)
        sql_query_update = f"""
        UPDATE variants as table_variants
        SET INFO =  CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END ||
                    (
                    SELECT
                        CASE
                            WHEN table_variants.INFO NOT IN ('','.')
                                AND table_vcf.INFO NOT IN ('','.')
                            THEN ';'
                            ELSE ''
                        END ||
                        CASE
                            WHEN table_vcf.INFO NOT IN ('','.')
                            THEN table_vcf.INFO
                            ELSE ''
                        END
                    FROM {table_vcf} as table_vcf
                    WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                        AND table_vcf.\"POS\" = table_variants.\"POS\"
                        AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                        AND table_vcf.\"REF\" = table_variants.\"REF\"
                    )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)

    def drop_variants_table(self) -> None:
        """
        Drop the variants table (if it exists).
        """

        table_variants = self.get_table_variants()
        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
        self.conn.execute(sql_table_variants)

    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        It adds a column to the variants table called `variant_id` and populates it with a hash of the
        `#CHROM`, `POS`, `REF`, and `ALT` columns

        :param variant_id_column: The name of the column to be created in the variants table, defaults
        to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be created even if it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # variant_id column
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            self.conn.execute(
                f"""
                UPDATE {table_variants}
                SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT",
'"{prefix}SVTYPE"')
                """
            )
            # NOTE(review): the last hash argument is the SQL string literal
            # '"{prefix}SVTYPE"' (the column *name*, quoted), not the column's
            # value — confirm this is intended

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column

    def get_variant_id_column(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        This function returns the variant_id column name (creating/populating the
        column via `set_variant_id` as a side effect).

        :param variant_id_column: The name of the column that contains the variant IDs,
        defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column is (re)computed even if it already exists
        :type force: bool
        :return: The variant_id column name.
        """

        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

    ###
    # Annotation
    ###

    def scan_databases(
        self,
        database_formats: list = ["parquet"],
        database_releases: list = ["current"],
    ) -> dict:
        """
        The function `scan_databases` scans for available databases based on specified formats and
        releases.

        :param database_formats: list of database formats to scan for (e.g. "parquet")
        :type database_formats: list
        :param database_releases: list of database releases to scan for, defaults to ["current"]
        :type database_releases: list
        :return: The function `scan_databases` returns a dictionary containing information about
        databases that match the specified formats and releases.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param overrides config; falls back to default)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # Scan for available databases
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
        )
        databases_infos_dict = databases_infos(
            database_folder_releases=database_releases,
            database_formats=database_formats,
            assembly=assembly,
            config=config,
        )
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
        )

        return databases_infos_dict

    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.
    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.

        Normalizes the various quick-annotation parameters (`annotations`,
        `annotation_parquet`, `annotation_snpsift`, `annotation_snpeff`,
        `annotation_bcftools`, `annotation_annovar`, `annotation_exomiser`,
        `annotation_splice`) into the structured `param["annotation"]` dict,
        resolves each database file on disk, chooses an annotation tool per
        database, then dispatches to the per-tool annotation methods.
        Finally explodes INFO fields into table columns when configured.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param overrides config; warn on fallback)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # annotations databases folders (union of generic, parquet and bcftools folders)
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (comma-separated string form)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tools param: fold per-tool shortcut params into the single list,
        # prefixing each entry with its tool name
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters: normalize the comma-separated
            # string into a dict keyed by annotation entry
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f"   {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL: "ALL[:format=...][:release=...]"
                # expands to every database found on disk
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases)
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for availabled databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each databases
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff: everything after "snpeff:" is treated
                    # as raw snpEff command-line options
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar: each ":"-separated token is an
                    # Annovar annotation name
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS
                    else:

                        # Tools detection: explicit "tool:" prefix, otherwise
                        # auto-detect from the database format below
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("bigwig:"):
                            annotation_tool_initial = "bigwig"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files ("+" and ":" both separate files)
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file: literal path, expanded path, then
                                # assembly subfolders of the database folders
                                annotation_file_found = None

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file
                                elif os.path.exists(full_path(annotation_file)):
                                    annotation_file_found = full_path(annotation_file)
                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                    log.debug(
                                        f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                    )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    # NOTE(review): hard-coded False makes the
                                    # bcftools auto-selection branch below dead
                                    # code — confirm whether this is intended
                                    bcftools_preference = False

                                    # Check Annotation Tool
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        # NOTE(review): "tsv" appears twice in
                                        # this list (harmless duplicate)
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        elif quick_annotation_format in ["bw"]:
                                            annotation_tool = "bigwig"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch: register the
                                    # resolved file under its tool's params
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    log.warning(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

        self.set_param(param)

        # Dispatch to each configured annotation backend
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("bigwig", None):
                log.info("Annotations 'bigwig'...")
                self.annotation_bigwig()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
    def annotation_bigwig(self, threads: int = None) -> None:
        """
        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.

        Exports the variants to a temporary VCF, looks up each variant position
        in the configured BigWig databases (via pyBigWig), writes the values as
        INFO fields with cyvcf2, then merges the annotated VCF back into the
        variants table.

        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
        number of threads to be used for parallel processing during the annotation process. If the
        `threads` parameter is not provided, the method will attempt to determine the optimal number of
        threads to use based on the system configuration
        :type threads: int
        :return: True
        """
        # NOTE(review): despite the `-> None` annotation, this method returns
        # True on success and None on early exit — confirm intended contract.

        # DEBUG
        log.debug("Start annotation with bigwig databases")

        # # Threads
        # if not threads:
        #     threads = self.get_threads()
        # log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - BigWig databases folders (generic annotations + bigwig)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bigwig", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bigwig", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty — nothing to annotate in an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # Export VCF file
                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")

                # annotation_bigwig_config: one entry per database with its
                # open handle and header/index bookkeeping
                annotation_bigwig_config_list = []

                for annotation in annotations:
                    annotation_fields = annotations[annotation]

                    # Annotation Name
                    annotation_name = os.path.basename(annotation)

                    if not annotation_fields:
                        annotation_fields = {"INFO": None}

                    log.debug(f"Annotation '{annotation_name}'")
                    log.debug(
                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                    )

                    # Create Database
                    database = Database(
                        database=annotation,
                        databases_folders=databases_folders,
                        assembly=assembly,
                    )

                    # Find files
                    db_file = database.get_database()
                    db_file = full_path(db_file)
                    db_hdr_file = database.get_header_file()
                    db_hdr_file = full_path(db_hdr_file)
                    db_file_type = database.get_format()

                    # If db_file is http ?
                    if database.get_database().startswith("http"):

                        # Database is HTTP URL (pyBigWig supports remote files)
                        db_file_is_http = True

                        # DB file keep as URL
                        db_file = database.get_database()
                        log.warning(
                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
                        )

                        # Retrieve automatic annotation field name
                        annotation_field = clean_annotation_field(
                            os.path.basename(db_file).replace(".bw", "")
                        )
                        log.debug(
                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
                        )

                        # Create automatic header file (no local header exists
                        # for a remote URL)
                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
                        with open(db_hdr_file, "w") as f:
                            f.write("##fileformat=VCFv4.2\n")
                            f.write(
                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
                            )
                            f.write(f"#CHROM START END {annotation_field}\n")

                    else:

                        # Database is NOT HTTP URL
                        db_file_is_http = False

                    # Check index - try to create if not exists
                    if (
                        db_file is None
                        or db_hdr_file is None
                        or (not os.path.exists(db_file) and not db_file_is_http)
                        or not os.path.exists(db_hdr_file)
                        or not db_file_type in ["bw"]
                    ):
                        # if False:
                        log.error("Annotation failed: database not valid")
                        log.error(f"Annotation annotation file: {db_file}")
                        log.error(f"Annotation annotation file type: {db_file_type}")
                        log.error(f"Annotation annotation header: {db_hdr_file}")
                        raise ValueError(
                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
                        )
                    else:

                        # Log
                        log.debug(
                            f"Annotation '{annotation}' - file: "
                            + str(db_file)
                            + " and "
                            + str(db_hdr_file)
                        )

                        # Load header as VCF object
                        db_hdr_vcf = Variants(input=db_hdr_file)
                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                        log.debug(
                            "Annotation database header: "
                            + str(db_hdr_vcf_header_infos)
                        )

                        # For all fields in database ("ALL"/"INFO" expands to
                        # every INFO field declared in the header)
                        annotation_fields_full = False
                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
                            annotation_fields = {
                                key: key for key in db_hdr_vcf_header_infos
                            }
                            log.debug(
                                "Annotation database header - All annotations added: "
                                + str(annotation_fields)
                            )
                            annotation_fields_full = True

                        # Init
                        cyvcf2_header_rename_dict = {}
                        cyvcf2_header_list = []
                        cyvcf2_header_indexes = {}

                        # process annotation fields
                        for annotation_field in annotation_fields:

                            # New annotation name
                            annotation_field_new = annotation_fields[annotation_field]

                            # Check annotation field and index in header
                            # (- 3 skips the #CHROM START END columns)
                            if (
                                annotation_field
                                in db_hdr_vcf.get_header_columns_as_list()
                            ):
                                annotation_field_index = (
                                    db_hdr_vcf.get_header_columns_as_list().index(
                                        annotation_field
                                    )
                                    - 3
                                )
                                cyvcf2_header_indexes[annotation_field_new] = (
                                    annotation_field_index
                                )
                            else:
                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
                                log.error(msg_err)
                                raise ValueError(msg_err)

                            # Append annotation field in cyvcf2 header list
                            cyvcf2_header_rename_dict[annotation_field_new] = (
                                db_hdr_vcf_header_infos[annotation_field].id
                            )
                            cyvcf2_header_list.append(
                                {
                                    "ID": annotation_field_new,
                                    "Number": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].num,
                                    "Type": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].type,
                                    "Description": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].desc,
                                }
                            )

                            # Add header on VCF
                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
                                annotation_field_new,
                                db_hdr_vcf_header_infos[annotation_field].num,
                                db_hdr_vcf_header_infos[annotation_field].type,
                                db_hdr_vcf_header_infos[annotation_field].desc,
                                "HOWARD BigWig annotation",
                                "unknown",
                                self.code_type_map[
                                    db_hdr_vcf_header_infos[annotation_field].type
                                ],
                            )

                        # Load bigwig database
                        bw_db = pyBigWig.open(db_file)
                        if bw_db.isBigWig():
                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
                        else:
                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
                            log.error(msg_err)
                            raise ValueError(msg_err)

                        annotation_bigwig_config_list.append(
                            {
                                "db_file": db_file,
                                "bw_db": bw_db,
                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
                                "cyvcf2_header_list": cyvcf2_header_list,
                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
                            }
                        )

                # Annotate
                if annotation_bigwig_config_list:

                    # Annotation config
                    log.debug(
                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
                    )

                    # Export VCF file
                    self.export_variant_vcf(
                        vcf_file=tmp_vcf_name,
                        remove_info=True,
                        add_samples=False,
                        index=True,
                    )

                    # Load input tmp file
                    input_vcf = cyvcf2.VCF(tmp_vcf_name)

                    # Add header in input file
                    for annotation_bigwig_config in annotation_bigwig_config_list:
                        for cyvcf2_header_field in annotation_bigwig_config.get(
                            "cyvcf2_header_list", []
                        ):
                            log.info(
                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
                            )
                            input_vcf.add_info_to_header(cyvcf2_header_field)

                    # Create output VCF file
                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)

                    # Fetch variants
                    log.info(f"Annotations 'bigwig' start...")
                    for variant in input_vcf:

                        for annotation_bigwig_config in annotation_bigwig_config_list:

                            # DB and indexes
                            bw_db = annotation_bigwig_config.get("bw_db", None)
                            cyvcf2_header_indexes = annotation_bigwig_config.get(
                                "cyvcf2_header_indexes", None
                            )

                            # Retrieve value from chrom pos (VCF POS is 1-based,
                            # BigWig intervals are 0-based half-open)
                            res = bw_db.values(
                                variant.CHROM, variant.POS - 1, variant.POS
                            )

                            # For each annotation fields (and indexes)
                            for cyvcf2_header_index in cyvcf2_header_indexes:

                                # If value is NOT None
                                if not np.isnan(
                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
                                ):
                                    variant.INFO[cyvcf2_header_index] = res[
                                        cyvcf2_header_indexes[cyvcf2_header_index]
                                    ]

                        # Add record in output file
                        output_vcf.write_record(variant)

                    # Log
                    log.debug(f"Annotation done.")

                    # Close and write file
                    log.info(f"Annotations 'bigwig' write...")
                    output_vcf.close()
                    log.debug(f"Write done.")

                    # Update variants
                    log.info(f"Annotations 'bigwig' update...")
                    self.update_from_vcf(output_vcf_file)
                    log.debug(f"Update done.")

        return True
for variant in input_vcf: 3558 3559 for annotation_bigwig_config in annotation_bigwig_config_list: 3560 3561 # DB and indexes 3562 bw_db = annotation_bigwig_config.get("bw_db", None) 3563 cyvcf2_header_indexes = annotation_bigwig_config.get( 3564 "cyvcf2_header_indexes", None 3565 ) 3566 3567 # Retrieve value from chrom pos 3568 res = bw_db.values( 3569 variant.CHROM, variant.POS - 1, variant.POS 3570 ) 3571 3572 # For each annotation fields (and indexes) 3573 for cyvcf2_header_index in cyvcf2_header_indexes: 3574 3575 # If value is NOT nNone 3576 if not np.isnan( 3577 res[cyvcf2_header_indexes[cyvcf2_header_index]] 3578 ): 3579 variant.INFO[cyvcf2_header_index] = res[ 3580 cyvcf2_header_indexes[cyvcf2_header_index] 3581 ] 3582 3583 # Add record in output file 3584 output_vcf.write_record(variant) 3585 3586 # Log 3587 log.debug(f"Annotation done.") 3588 3589 # Close and write file 3590 log.info(f"Annotations 'bigwig' write...") 3591 output_vcf.close() 3592 log.debug(f"Write done.") 3593 3594 # Update variants 3595 log.info(f"Annotations 'bigwig' update...") 3596 self.update_from_vcf(output_vcf_file) 3597 log.debug(f"Update done.") 3598 3599 return True 3600 3601 def annotation_snpsift(self, threads: int = None) -> None: 3602 """ 3603 This function annotate with bcftools 3604 3605 :param threads: Number of threads to use 3606 :return: the value of the variable "return_value". 
3607 """ 3608 3609 # DEBUG 3610 log.debug("Start annotation with bcftools databases") 3611 3612 # Threads 3613 if not threads: 3614 threads = self.get_threads() 3615 log.debug("Threads: " + str(threads)) 3616 3617 # Config 3618 config = self.get_config() 3619 log.debug("Config: " + str(config)) 3620 3621 # Config - snpSift 3622 snpsift_bin_command = get_bin_command( 3623 bin="SnpSift.jar", 3624 tool="snpsift", 3625 bin_type="jar", 3626 config=config, 3627 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3628 ) 3629 if not snpsift_bin_command: 3630 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3631 log.error(msg_err) 3632 raise ValueError(msg_err) 3633 3634 # Config - bcftools 3635 bcftools_bin_command = get_bin_command( 3636 bin="bcftools", 3637 tool="bcftools", 3638 bin_type="bin", 3639 config=config, 3640 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3641 ) 3642 if not bcftools_bin_command: 3643 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3644 log.error(msg_err) 3645 raise ValueError(msg_err) 3646 3647 # Config - BCFTools databases folders 3648 databases_folders = set( 3649 self.get_config() 3650 .get("folders", {}) 3651 .get("databases", {}) 3652 .get("annotations", ["."]) 3653 + self.get_config() 3654 .get("folders", {}) 3655 .get("databases", {}) 3656 .get("bcftools", ["."]) 3657 ) 3658 log.debug("Databases annotations: " + str(databases_folders)) 3659 3660 # Param 3661 annotations = ( 3662 self.get_param() 3663 .get("annotation", {}) 3664 .get("snpsift", {}) 3665 .get("annotations", None) 3666 ) 3667 log.debug("Annotations: " + str(annotations)) 3668 3669 # Assembly 3670 assembly = self.get_param().get( 3671 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3672 ) 3673 3674 # Data 3675 table_variants = self.get_table_variants() 3676 3677 # Check if not empty 3678 log.debug("Check if not empty") 3679 sql_query_chromosomes = ( 3680 f"""SELECT count(*) as count FROM {table_variants} as 
table_variants""" 3681 ) 3682 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3683 if not sql_query_chromosomes_df["count"][0]: 3684 log.info(f"VCF empty") 3685 return 3686 3687 # VCF header 3688 vcf_reader = self.get_header() 3689 log.debug("Initial header: " + str(vcf_reader.infos)) 3690 3691 # Existing annotations 3692 for vcf_annotation in self.get_header().infos: 3693 3694 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3695 log.debug( 3696 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3697 ) 3698 3699 if annotations: 3700 3701 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3702 3703 # Export VCF file 3704 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3705 3706 # Init 3707 commands = {} 3708 3709 for annotation in annotations: 3710 annotation_fields = annotations[annotation] 3711 3712 # Annotation Name 3713 annotation_name = os.path.basename(annotation) 3714 3715 if not annotation_fields: 3716 annotation_fields = {"INFO": None} 3717 3718 log.debug(f"Annotation '{annotation_name}'") 3719 log.debug( 3720 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3721 ) 3722 3723 # Create Database 3724 database = Database( 3725 database=annotation, 3726 databases_folders=databases_folders, 3727 assembly=assembly, 3728 ) 3729 3730 # Find files 3731 db_file = database.get_database() 3732 db_file = full_path(db_file) 3733 db_hdr_file = database.get_header_file() 3734 db_hdr_file = full_path(db_hdr_file) 3735 db_file_type = database.get_format() 3736 db_tbi_file = f"{db_file}.tbi" 3737 db_file_compressed = database.is_compressed() 3738 3739 # Check if compressed 3740 if not db_file_compressed: 3741 log.error( 3742 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3743 ) 3744 raise ValueError( 3745 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3746 ) 3747 3748 # Check if indexed 3749 if not os.path.exists(db_tbi_file): 3750 log.error( 3751 
f"Annotation '{annotation}' - {db_file} NOT indexed file" 3752 ) 3753 raise ValueError( 3754 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3755 ) 3756 3757 # Check index - try to create if not exists 3758 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3759 log.error("Annotation failed: database not valid") 3760 log.error(f"Annotation annotation file: {db_file}") 3761 log.error(f"Annotation annotation header: {db_hdr_file}") 3762 log.error(f"Annotation annotation index: {db_tbi_file}") 3763 raise ValueError( 3764 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3765 ) 3766 else: 3767 3768 log.debug( 3769 f"Annotation '{annotation}' - file: " 3770 + str(db_file) 3771 + " and " 3772 + str(db_hdr_file) 3773 ) 3774 3775 # Load header as VCF object 3776 db_hdr_vcf = Variants(input=db_hdr_file) 3777 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3778 log.debug( 3779 "Annotation database header: " 3780 + str(db_hdr_vcf_header_infos) 3781 ) 3782 3783 # For all fields in database 3784 annotation_fields_full = False 3785 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3786 annotation_fields = { 3787 key: key for key in db_hdr_vcf_header_infos 3788 } 3789 log.debug( 3790 "Annotation database header - All annotations added: " 3791 + str(annotation_fields) 3792 ) 3793 annotation_fields_full = True 3794 3795 # # Create file for field rename 3796 # log.debug("Create file for field rename") 3797 # tmp_rename = NamedTemporaryFile( 3798 # prefix=self.get_prefix(), 3799 # dir=self.get_tmp_dir(), 3800 # suffix=".rename", 3801 # delete=False, 3802 # ) 3803 # tmp_rename_name = tmp_rename.name 3804 # tmp_files.append(tmp_rename_name) 3805 3806 # Number of fields 3807 nb_annotation_field = 0 3808 annotation_list = [] 3809 annotation_infos_rename_list = [] 3810 3811 for annotation_field in 
annotation_fields: 3812 3813 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3814 annotation_fields_new_name = annotation_fields.get( 3815 annotation_field, annotation_field 3816 ) 3817 if not annotation_fields_new_name: 3818 annotation_fields_new_name = annotation_field 3819 3820 # Check if field is in DB and if field is not elready in input data 3821 if ( 3822 annotation_field in db_hdr_vcf.get_header().infos 3823 and annotation_fields_new_name 3824 not in self.get_header().infos 3825 ): 3826 3827 log.info( 3828 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3829 ) 3830 3831 # BCFTools annotate param to rename fields 3832 if annotation_field != annotation_fields_new_name: 3833 annotation_infos_rename_list.append( 3834 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3835 ) 3836 3837 # Add INFO field to header 3838 db_hdr_vcf_header_infos_number = ( 3839 db_hdr_vcf_header_infos[annotation_field].num or "." 3840 ) 3841 db_hdr_vcf_header_infos_type = ( 3842 db_hdr_vcf_header_infos[annotation_field].type 3843 or "String" 3844 ) 3845 db_hdr_vcf_header_infos_description = ( 3846 db_hdr_vcf_header_infos[annotation_field].desc 3847 or f"{annotation_field} description" 3848 ) 3849 db_hdr_vcf_header_infos_source = ( 3850 db_hdr_vcf_header_infos[annotation_field].source 3851 or "unknown" 3852 ) 3853 db_hdr_vcf_header_infos_version = ( 3854 db_hdr_vcf_header_infos[annotation_field].version 3855 or "unknown" 3856 ) 3857 3858 vcf_reader.infos[annotation_fields_new_name] = ( 3859 vcf.parser._Info( 3860 annotation_fields_new_name, 3861 db_hdr_vcf_header_infos_number, 3862 db_hdr_vcf_header_infos_type, 3863 db_hdr_vcf_header_infos_description, 3864 db_hdr_vcf_header_infos_source, 3865 db_hdr_vcf_header_infos_version, 3866 self.code_type_map[ 3867 db_hdr_vcf_header_infos_type 3868 ], 3869 ) 3870 ) 3871 3872 annotation_list.append(annotation_field) 3873 3874 nb_annotation_field += 1 3875 3876 else: 3877 
3878 if ( 3879 annotation_field 3880 not in db_hdr_vcf.get_header().infos 3881 ): 3882 log.warning( 3883 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3884 ) 3885 if ( 3886 annotation_fields_new_name 3887 in self.get_header().infos 3888 ): 3889 log.warning( 3890 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3891 ) 3892 3893 log.info( 3894 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3895 ) 3896 3897 annotation_infos = ",".join(annotation_list) 3898 3899 if annotation_infos != "": 3900 3901 # Annotated VCF (and error file) 3902 tmp_annotation_vcf_name = os.path.join( 3903 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3904 ) 3905 tmp_annotation_vcf_name_err = ( 3906 tmp_annotation_vcf_name + ".err" 3907 ) 3908 3909 # Add fields to annotate 3910 if not annotation_fields_full: 3911 annotation_infos_option = f"-info {annotation_infos}" 3912 else: 3913 annotation_infos_option = "" 3914 3915 # Info fields rename 3916 if annotation_infos_rename_list: 3917 annotation_infos_rename = " -c " + ",".join( 3918 annotation_infos_rename_list 3919 ) 3920 else: 3921 annotation_infos_rename = "" 3922 3923 # Annotate command 3924 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3925 3926 # Add command 3927 commands[command_annotate] = tmp_annotation_vcf_name 3928 3929 if commands: 3930 3931 # Export VCF file 3932 self.export_variant_vcf( 3933 vcf_file=tmp_vcf_name, 3934 remove_info=True, 3935 add_samples=False, 3936 index=True, 3937 ) 3938 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3939 3940 # Num command 3941 nb_command = 0 3942 3943 # Annotate 3944 for command_annotate in commands: 3945 nb_command += 1 3946 log.info( 3947 f"Annotation - 
Annotate [{nb_command}/{len(commands)}]..."
                )
                log.debug(f"command_annotate={command_annotate}")
                run_parallel_commands([command_annotate], threads)

                # Debug
                # NOTE(review): hard-coded debug copy to /tmp — presumably leftover
                # instrumentation; confirm whether it should be removed or gated on verbosity
                shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")

                # Update variants
                log.info(
                    f"Annotation - Updating [{nb_command}/{len(commands)}]..."
                )
                self.update_from_vcf(commands[command_annotate])

    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants table using bcftools and the databases configured
        in param "annotation" > "bcftools" > "annotations".

        For each configured database (compressed and tabix-indexed VCF/BED plus a
        header file), every requested INFO field that exists in the database header
        and is not already present in the input header is added to the in-memory
        header; one `bcftools annotate` command is built per chromosome (restricted
        to merged +/-1Mb regions around the variants), all commands are run in
        parallel, the annotated files are merged back with `bcftools merge`, and
        the result is loaded into the variants table via `update_from_vcf`.

        :param threads: Number of threads to use; defaults to `self.get_threads()`
        :return: None; returns early if the variants table is empty
        :raises ValueError: if the bcftools binary is not found, a database file is
            not compressed/indexed/valid, or any annotate/merge command wrote
            "[E::" errors to its stderr file
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads: fall back to the object's configured thread count
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Keep temporary files/folders when running at debug verbosity
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        # (union of generic "annotations" folders and bcftools-specific folders)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param: mapping of database -> {field: new_name} to annotate with
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param takes precedence over config, then default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF: temp file that will hold the current variants
        # (actually written later, just before the commands run)
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header (mutated below when new INFO fields are added)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No explicit fields requested -> take all INFO fields
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database (resolves the file within folders/assembly)
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files (data, header, index) for this database
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bcftools requires bgzipped input)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed (tabix .tbi required for region queries)
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check database and header files exist
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object (header file parsed as a VCF)
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # For all fields in database ("ALL"/"INFO" -> expand to every field)
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check if field is in DB and if field is not already in input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header, defaulting missing metadata
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # Build the bcftools -c column spec; renamed fields
                            # use the DST:=INFO/SRC syntax
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            # Field skipped: either absent from the database,
                            # or target name already present in the input header
                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (remove "#CHROM" and variants line)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command: keep only "##" meta lines from the header file
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chromosomes present in the variants table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED columns in the annotation file
                        # (BED databases need the positional columns prepended to -c)
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detect regions: +/-1Mb windows around each variant,
                            # clamped at 0, then merged into non-overlapping intervals
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files (per-chromosome annotated output + stderr)
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command: annotate restricted to this chromosome's
                            # regions, bgzip output (-Oz1), then tabix-index it
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export VCF file (now write the variants the commands will read)
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # calculate threads for annotated commands
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge all per-chromosome annotated files back together
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command (chained after a successful merge)
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Error messages: scan all stderr files for bcftools/tabix
                    # "[W::" warnings and "[E::" errors
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f" {message}")
                    # debug info
                    for message in list(set(error_message_command_all)):
                        log.debug(f" {message}")
                    # failed: any "[E::" line aborts the annotation
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants table from the merged annotated VCF
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)

    def annotation_exomiser(self, threads: int = None) -> None:
        """
        This function annotate with Exomiser

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
        - "analysis" (dict/file):
            Full analysis dictionnary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
            Default : None
        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"
        - "phenopacket" (dict/file):
            Samples and phenotipic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None
        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
                "subject":
                    {
                        "id": "ISDBM322017",
                        "sex": "FEMALE"
                    }
            Default: None
        - "sample" (string):
            Sample name to construct "subject" section:
                "subject":
                    {
                        "id": "<sample>",
                        "sex": "UNKNOWN_SEX"
                    }
            Default: None
        - "phenotypicFeatures" (dict)
4479 Example: 4480 "phenotypicFeatures": 4481 [ 4482 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 4483 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 4484 ] 4485 - "hpo" (list) 4486 List of HPO ids as phenotypic features. 4487 Example: 4488 "hpo": ['0001156', '0001363', '0011304', '0010055'] 4489 Default: [] 4490 - "outputOptions" (dict): 4491 Output options (see Exomiser docs). 4492 Default: 4493 "output_options" = 4494 { 4495 "outputContributingVariantsOnly": False, 4496 "numGenes": 0, 4497 "outputFormats": ["TSV_VARIANT", "VCF"] 4498 } 4499 - "transcript_source" (string): 4500 Transcript source (either "refseq", "ucsc", "ensembl") 4501 Default: "refseq" 4502 - "exomiser_to_info" (boolean): 4503 Add exomiser TSV file columns as INFO fields in VCF. 4504 Default: False 4505 - "release" (string): 4506 Exomise database release. 4507 If not exists, database release will be downloaded (take a while). 4508 Default: None (provided by application.properties configuration file) 4509 - "exomiser_application_properties" (file): 4510 Exomiser configuration file (see Exomiser docs). 4511 Useful to automatically download databases (especially for specific genome databases). 4512 4513 Notes: 4514 - If no sample in parameters, first sample in VCF will be chosen 4515 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4516 4517 :param threads: The number of threads to use 4518 :return: None. 
4519 """ 4520 4521 # DEBUG 4522 log.debug("Start annotation with Exomiser databases") 4523 4524 # Threads 4525 if not threads: 4526 threads = self.get_threads() 4527 log.debug("Threads: " + str(threads)) 4528 4529 # Config 4530 config = self.get_config() 4531 log.debug("Config: " + str(config)) 4532 4533 # Config - Folders - Databases 4534 databases_folders = ( 4535 config.get("folders", {}) 4536 .get("databases", {}) 4537 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4538 ) 4539 databases_folders = full_path(databases_folders) 4540 if not os.path.exists(databases_folders): 4541 log.error(f"Databases annotations: {databases_folders} NOT found") 4542 log.debug("Databases annotations: " + str(databases_folders)) 4543 4544 # Config - Exomiser 4545 exomiser_bin_command = get_bin_command( 4546 bin="exomiser-cli*.jar", 4547 tool="exomiser", 4548 bin_type="jar", 4549 config=config, 4550 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4551 ) 4552 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4553 if not exomiser_bin_command: 4554 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4555 log.error(msg_err) 4556 raise ValueError(msg_err) 4557 4558 # Param 4559 param = self.get_param() 4560 log.debug("Param: " + str(param)) 4561 4562 # Param - Exomiser 4563 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4564 log.debug(f"Param Exomiser: {param_exomiser}") 4565 4566 # Param - Assembly 4567 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4568 log.debug("Assembly: " + str(assembly)) 4569 4570 # Data 4571 table_variants = self.get_table_variants() 4572 4573 # Check if not empty 4574 log.debug("Check if not empty") 4575 sql_query_chromosomes = ( 4576 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4577 ) 4578 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4579 log.info(f"VCF empty") 4580 return False 4581 4582 # VCF header 4583 
vcf_reader = self.get_header() 4584 log.debug("Initial header: " + str(vcf_reader.infos)) 4585 4586 # Samples 4587 samples = self.get_header_sample_list() 4588 if not samples: 4589 log.error("No Samples in VCF") 4590 return False 4591 log.debug(f"Samples: {samples}") 4592 4593 # Memory limit 4594 memory_limit = self.get_memory("8G") 4595 log.debug(f"memory_limit: {memory_limit}") 4596 4597 # Exomiser java options 4598 exomiser_java_options = ( 4599 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4600 ) 4601 log.debug(f"Exomiser java options: {exomiser_java_options}") 4602 4603 # Download Exomiser (if not exists) 4604 exomiser_release = param_exomiser.get("release", None) 4605 exomiser_application_properties = param_exomiser.get( 4606 "exomiser_application_properties", None 4607 ) 4608 databases_download_exomiser( 4609 assemblies=[assembly], 4610 exomiser_folder=databases_folders, 4611 exomiser_release=exomiser_release, 4612 exomiser_phenotype_release=exomiser_release, 4613 exomiser_application_properties=exomiser_application_properties, 4614 ) 4615 4616 # Force annotation 4617 force_update_annotation = True 4618 4619 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4620 log.debug("Start annotation Exomiser") 4621 4622 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4623 4624 # tmp_dir = "/tmp/exomiser" 4625 4626 ### ANALYSIS ### 4627 ################ 4628 4629 # Create analysis.json through analysis dict 4630 # either analysis in param or by default 4631 # depending on preset exome/genome) 4632 4633 # Init analysis dict 4634 param_exomiser_analysis_dict = {} 4635 4636 # analysis from param 4637 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4638 param_exomiser_analysis = full_path(param_exomiser_analysis) 4639 4640 # If analysis in param -> load anlaysis json 4641 if param_exomiser_analysis: 4642 4643 # If param analysis is a file and exists 4644 if isinstance(param_exomiser_analysis, str) 
and os.path.exists( 4645 param_exomiser_analysis 4646 ): 4647 # Load analysis file into analysis dict (either yaml or json) 4648 with open(param_exomiser_analysis) as json_file: 4649 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4650 4651 # If param analysis is a dict 4652 elif isinstance(param_exomiser_analysis, dict): 4653 # Load analysis dict into analysis dict (either yaml or json) 4654 param_exomiser_analysis_dict = param_exomiser_analysis 4655 4656 # Error analysis type 4657 else: 4658 log.error(f"Analysis type unknown. Check param file.") 4659 raise ValueError(f"Analysis type unknown. Check param file.") 4660 4661 # Case no input analysis config file/dict 4662 # Use preset (exome/genome) to open default config file 4663 if not param_exomiser_analysis_dict: 4664 4665 # default preset 4666 default_preset = "exome" 4667 4668 # Get param preset or default preset 4669 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4670 4671 # Try to find if preset is a file 4672 if os.path.exists(param_exomiser_preset): 4673 # Preset file is provided in full path 4674 param_exomiser_analysis_default_config_file = ( 4675 param_exomiser_preset 4676 ) 4677 # elif os.path.exists(full_path(param_exomiser_preset)): 4678 # # Preset file is provided in full path 4679 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4680 elif os.path.exists( 4681 os.path.join(folder_config, param_exomiser_preset) 4682 ): 4683 # Preset file is provided a basename in config folder (can be a path with subfolders) 4684 param_exomiser_analysis_default_config_file = os.path.join( 4685 folder_config, param_exomiser_preset 4686 ) 4687 else: 4688 # Construct preset file 4689 param_exomiser_analysis_default_config_file = os.path.join( 4690 folder_config, 4691 f"preset-{param_exomiser_preset}-analysis.json", 4692 ) 4693 4694 # If preset file exists 4695 param_exomiser_analysis_default_config_file = full_path( 4696 
param_exomiser_analysis_default_config_file 4697 ) 4698 if os.path.exists(param_exomiser_analysis_default_config_file): 4699 # Load prest file into analysis dict (either yaml or json) 4700 with open( 4701 param_exomiser_analysis_default_config_file 4702 ) as json_file: 4703 # param_exomiser_analysis_dict[""] = json.load(json_file) 4704 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4705 json_file 4706 ) 4707 4708 # Error preset file 4709 else: 4710 log.error( 4711 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4712 ) 4713 raise ValueError( 4714 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4715 ) 4716 4717 # If no analysis dict created 4718 if not param_exomiser_analysis_dict: 4719 log.error(f"No analysis config") 4720 raise ValueError(f"No analysis config") 4721 4722 # Log 4723 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4724 4725 ### PHENOPACKET ### 4726 ################### 4727 4728 # If no PhenoPacket in analysis dict -> check in param 4729 if "phenopacket" not in param_exomiser_analysis_dict: 4730 4731 # If PhenoPacket in param -> load anlaysis json 4732 if param_exomiser.get("phenopacket", None): 4733 4734 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4735 param_exomiser_phenopacket = full_path( 4736 param_exomiser_phenopacket 4737 ) 4738 4739 # If param phenopacket is a file and exists 4740 if isinstance( 4741 param_exomiser_phenopacket, str 4742 ) and os.path.exists(param_exomiser_phenopacket): 4743 # Load phenopacket file into analysis dict (either yaml or json) 4744 with open(param_exomiser_phenopacket) as json_file: 4745 param_exomiser_analysis_dict["phenopacket"] = ( 4746 yaml.safe_load(json_file) 4747 ) 4748 4749 # If param phenopacket is a dict 4750 elif isinstance(param_exomiser_phenopacket, dict): 4751 # Load phenopacket dict into analysis dict (either yaml or json) 4752 param_exomiser_analysis_dict["phenopacket"] = ( 4753 
param_exomiser_phenopacket 4754 ) 4755 4756 # Error phenopacket type 4757 else: 4758 log.error(f"Phenopacket type unknown. Check param file.") 4759 raise ValueError( 4760 f"Phenopacket type unknown. Check param file." 4761 ) 4762 4763 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4764 if "phenopacket" not in param_exomiser_analysis_dict: 4765 4766 # Init PhenoPacket 4767 param_exomiser_analysis_dict["phenopacket"] = { 4768 "id": "analysis", 4769 "proband": {}, 4770 } 4771 4772 ### Add subject ### 4773 4774 # If subject exists 4775 param_exomiser_subject = param_exomiser.get("subject", {}) 4776 4777 # If subject not exists -> found sample ID 4778 if not param_exomiser_subject: 4779 4780 # Found sample ID in param 4781 sample = param_exomiser.get("sample", None) 4782 4783 # Find sample ID (first sample) 4784 if not sample: 4785 sample_list = self.get_header_sample_list() 4786 if len(sample_list) > 0: 4787 sample = sample_list[0] 4788 else: 4789 log.error(f"No sample found") 4790 raise ValueError(f"No sample found") 4791 4792 # Create subject 4793 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4794 4795 # Add to dict 4796 param_exomiser_analysis_dict["phenopacket"][ 4797 "subject" 4798 ] = param_exomiser_subject 4799 4800 ### Add "phenotypicFeatures" ### 4801 4802 # If phenotypicFeatures exists 4803 param_exomiser_phenotypicfeatures = param_exomiser.get( 4804 "phenotypicFeatures", [] 4805 ) 4806 4807 # If phenotypicFeatures not exists -> Try to infer from hpo list 4808 if not param_exomiser_phenotypicfeatures: 4809 4810 # Found HPO in param 4811 param_exomiser_hpo = param_exomiser.get("hpo", []) 4812 4813 # Split HPO if list in string format separated by comma 4814 if isinstance(param_exomiser_hpo, str): 4815 param_exomiser_hpo = param_exomiser_hpo.split(",") 4816 4817 # Create HPO list 4818 for hpo in param_exomiser_hpo: 4819 hpo_clean = re.sub("[^0-9]", "", hpo) 4820 param_exomiser_phenotypicfeatures.append( 4821 { 
4822 "type": { 4823 "id": f"HP:{hpo_clean}", 4824 "label": f"HP:{hpo_clean}", 4825 } 4826 } 4827 ) 4828 4829 # Add to dict 4830 param_exomiser_analysis_dict["phenopacket"][ 4831 "phenotypicFeatures" 4832 ] = param_exomiser_phenotypicfeatures 4833 4834 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4835 if not param_exomiser_phenotypicfeatures: 4836 for step in param_exomiser_analysis_dict.get( 4837 "analysis", {} 4838 ).get("steps", []): 4839 if "hiPhivePrioritiser" in step: 4840 param_exomiser_analysis_dict.get("analysis", {}).get( 4841 "steps", [] 4842 ).remove(step) 4843 4844 ### Add Input File ### 4845 4846 # Initial file name and htsFiles 4847 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4848 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4849 { 4850 "uri": tmp_vcf_name, 4851 "htsFormat": "VCF", 4852 "genomeAssembly": assembly, 4853 } 4854 ] 4855 4856 ### Add metaData ### 4857 4858 # If metaData not in analysis dict 4859 if "metaData" not in param_exomiser_analysis_dict: 4860 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4861 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4862 "createdBy": "howard", 4863 "phenopacketSchemaVersion": 1, 4864 } 4865 4866 ### OutputOptions ### 4867 4868 # Init output result folder 4869 output_results = os.path.join(tmp_dir, "results") 4870 4871 # If no outputOptions in analysis dict 4872 if "outputOptions" not in param_exomiser_analysis_dict: 4873 4874 # default output formats 4875 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4876 4877 # Get outputOptions in param 4878 output_options = param_exomiser.get("outputOptions", None) 4879 4880 # If no output_options in param -> check 4881 if not output_options: 4882 output_options = { 4883 "outputContributingVariantsOnly": False, 4884 "numGenes": 0, 4885 "outputFormats": defaut_output_formats, 4886 } 4887 4888 # Replace outputDirectory in output options 4889 output_options["outputDirectory"] = output_results 
4890 output_options["outputFileName"] = "howard" 4891 4892 # Add outputOptions in analysis dict 4893 param_exomiser_analysis_dict["outputOptions"] = output_options 4894 4895 else: 4896 4897 # Replace output_results and output format (if exists in param) 4898 param_exomiser_analysis_dict["outputOptions"][ 4899 "outputDirectory" 4900 ] = output_results 4901 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4902 list( 4903 set( 4904 param_exomiser_analysis_dict.get( 4905 "outputOptions", {} 4906 ).get("outputFormats", []) 4907 + ["TSV_VARIANT", "VCF"] 4908 ) 4909 ) 4910 ) 4911 4912 # log 4913 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4914 4915 ### ANALYSIS FILE ### 4916 ##################### 4917 4918 ### Full JSON analysis config file ### 4919 4920 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4921 with open(exomiser_analysis, "w") as fp: 4922 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4923 4924 ### SPLIT analysis and sample config files 4925 4926 # Splitted analysis dict 4927 param_exomiser_analysis_dict_for_split = ( 4928 param_exomiser_analysis_dict.copy() 4929 ) 4930 4931 # Phenopacket JSON file 4932 exomiser_analysis_phenopacket = os.path.join( 4933 tmp_dir, "analysis_phenopacket.json" 4934 ) 4935 with open(exomiser_analysis_phenopacket, "w") as fp: 4936 json.dump( 4937 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4938 fp, 4939 indent=4, 4940 ) 4941 4942 # Analysis JSON file without Phenopacket parameters 4943 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4944 exomiser_analysis_analysis = os.path.join( 4945 tmp_dir, "analysis_analysis.json" 4946 ) 4947 with open(exomiser_analysis_analysis, "w") as fp: 4948 json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4949 4950 ### INITAL VCF file ### 4951 ####################### 4952 4953 ### Create list of samples to use and include inti initial VCF file #### 4954 4955 # Subject (main sample) 4956 # Get sample ID in 
analysis dict 4957 sample_subject = ( 4958 param_exomiser_analysis_dict.get("phenopacket", {}) 4959 .get("subject", {}) 4960 .get("id", None) 4961 ) 4962 sample_proband = ( 4963 param_exomiser_analysis_dict.get("phenopacket", {}) 4964 .get("proband", {}) 4965 .get("subject", {}) 4966 .get("id", None) 4967 ) 4968 sample = [] 4969 if sample_subject: 4970 sample.append(sample_subject) 4971 if sample_proband: 4972 sample.append(sample_proband) 4973 4974 # Get sample ID within Pedigree 4975 pedigree_persons_list = ( 4976 param_exomiser_analysis_dict.get("phenopacket", {}) 4977 .get("pedigree", {}) 4978 .get("persons", {}) 4979 ) 4980 4981 # Create list with all sample ID in pedigree (if exists) 4982 pedigree_persons = [] 4983 for person in pedigree_persons_list: 4984 pedigree_persons.append(person.get("individualId")) 4985 4986 # Concat subject sample ID and samples ID in pedigreesamples 4987 samples = list(set(sample + pedigree_persons)) 4988 4989 # Check if sample list is not empty 4990 if not samples: 4991 log.error(f"No samples found") 4992 raise ValueError(f"No samples found") 4993 4994 # Create VCF with sample (either sample in param or first one by default) 4995 # Export VCF file 4996 self.export_variant_vcf( 4997 vcf_file=tmp_vcf_name, 4998 remove_info=True, 4999 add_samples=True, 5000 list_samples=samples, 5001 index=False, 5002 ) 5003 5004 ### Execute Exomiser ### 5005 ######################## 5006 5007 # Init command 5008 exomiser_command = "" 5009 5010 # Command exomiser options 5011 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 5012 5013 # Release 5014 exomiser_release = param_exomiser.get("release", None) 5015 if exomiser_release: 5016 # phenotype data version 5017 exomiser_options += ( 5018 f" --exomiser.phenotype.data-version={exomiser_release} " 5019 ) 5020 # data version 5021 exomiser_options += ( 5022 f" 
--exomiser.{assembly}.data-version={exomiser_release} " 5023 ) 5024 # variant white list 5025 variant_white_list_file = ( 5026 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 5027 ) 5028 if os.path.exists( 5029 os.path.join( 5030 databases_folders, assembly, variant_white_list_file 5031 ) 5032 ): 5033 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 5034 5035 # transcript_source 5036 transcript_source = param_exomiser.get( 5037 "transcript_source", None 5038 ) # ucsc, refseq, ensembl 5039 if transcript_source: 5040 exomiser_options += ( 5041 f" --exomiser.{assembly}.transcript-source={transcript_source} " 5042 ) 5043 5044 # If analysis contain proband param 5045 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 5046 "proband", {} 5047 ): 5048 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 5049 5050 # If no proband (usually uniq sample) 5051 else: 5052 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 5053 5054 # Log 5055 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 5056 5057 # Run command 5058 result = subprocess.call( 5059 exomiser_command_analysis.split(), stdout=subprocess.PIPE 5060 ) 5061 if result: 5062 log.error("Exomiser command failed") 5063 raise ValueError("Exomiser command failed") 5064 5065 ### RESULTS ### 5066 ############### 5067 5068 ### Annotate with TSV fields ### 5069 5070 # Init result tsv file 5071 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 5072 5073 # Init result tsv file 5074 output_results_tsv = os.path.join(output_results, "howard.variants.tsv") 5075 5076 # Parse TSV file and explode columns in INFO field 5077 if exomiser_to_info and os.path.exists(output_results_tsv): 5078 5079 # Log 5080 log.debug("Exomiser columns to VCF INFO field") 5081 5082 # Retrieve columns and 
types 5083 query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 5084 output_results_tsv_df = self.get_query_to_df(query) 5085 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 5086 5087 # Init concat fields for update 5088 sql_query_update_concat_fields = [] 5089 5090 # Fields to avoid 5091 fields_to_avoid = [ 5092 "CONTIG", 5093 "START", 5094 "END", 5095 "REF", 5096 "ALT", 5097 "QUAL", 5098 "FILTER", 5099 "GENOTYPE", 5100 ] 5101 5102 # List all columns to add into header 5103 for header_column in output_results_tsv_columns: 5104 5105 # If header column is enable 5106 if header_column not in fields_to_avoid: 5107 5108 # Header info type 5109 header_info_type = "String" 5110 header_column_df = output_results_tsv_df[header_column] 5111 header_column_df_dtype = header_column_df.dtype 5112 if header_column_df_dtype == object: 5113 if ( 5114 pd.to_numeric(header_column_df, errors="coerce") 5115 .notnull() 5116 .all() 5117 ): 5118 header_info_type = "Float" 5119 else: 5120 header_info_type = "Integer" 5121 5122 # Header info 5123 characters_to_validate = ["-"] 5124 pattern = "[" + "".join(characters_to_validate) + "]" 5125 header_info_name = re.sub( 5126 pattern, 5127 "_", 5128 f"Exomiser_{header_column}".replace("#", ""), 5129 ) 5130 header_info_number = "." 
5131 header_info_description = ( 5132 f"Exomiser {header_column} annotation" 5133 ) 5134 header_info_source = "Exomiser" 5135 header_info_version = "unknown" 5136 header_info_code = CODE_TYPE_MAP[header_info_type] 5137 vcf_reader.infos[header_info_name] = vcf.parser._Info( 5138 header_info_name, 5139 header_info_number, 5140 header_info_type, 5141 header_info_description, 5142 header_info_source, 5143 header_info_version, 5144 header_info_code, 5145 ) 5146 5147 # Add field to add for update to concat fields 5148 sql_query_update_concat_fields.append( 5149 f""" 5150 CASE 5151 WHEN table_parquet."{header_column}" NOT IN ('','.') 5152 THEN concat( 5153 '{header_info_name}=', 5154 table_parquet."{header_column}", 5155 ';' 5156 ) 5157 5158 ELSE '' 5159 END 5160 """ 5161 ) 5162 5163 # Update query 5164 sql_query_update = f""" 5165 UPDATE {table_variants} as table_variants 5166 SET INFO = concat( 5167 CASE 5168 WHEN INFO NOT IN ('', '.') 5169 THEN INFO 5170 ELSE '' 5171 END, 5172 CASE 5173 WHEN table_variants.INFO NOT IN ('','.') 5174 THEN ';' 5175 ELSE '' 5176 END, 5177 ( 5178 SELECT 5179 concat( 5180 {",".join(sql_query_update_concat_fields)} 5181 ) 5182 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 5183 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 5184 AND table_parquet.\"START\" = table_variants.\"POS\" 5185 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5186 AND table_parquet.\"REF\" = table_variants.\"REF\" 5187 ) 5188 ) 5189 ; 5190 """ 5191 5192 # Update 5193 self.conn.execute(sql_query_update) 5194 5195 ### Annotate with VCF INFO field ### 5196 5197 # Init result VCF file 5198 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 5199 5200 # If VCF exists 5201 if os.path.exists(output_results_vcf): 5202 5203 # Log 5204 log.debug("Exomiser result VCF update variants") 5205 5206 # Find Exomiser INFO field annotation in header 5207 with 
gzip.open(output_results_vcf, "rt") as f: 5208 header_list = self.read_vcf_header(f) 5209 exomiser_vcf_header = vcf.Reader( 5210 io.StringIO("\n".join(header_list)) 5211 ) 5212 5213 # Add annotation INFO field to header 5214 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 5215 5216 # Update variants with VCF 5217 self.update_from_vcf(output_results_vcf) 5218 5219 return True 5220 5221 def annotation_snpeff(self, threads: int = None) -> None: 5222 """ 5223 This function annotate with snpEff 5224 5225 :param threads: The number of threads to use 5226 :return: the value of the variable "return_value". 5227 """ 5228 5229 # DEBUG 5230 log.debug("Start annotation with snpeff databases") 5231 5232 # Threads 5233 if not threads: 5234 threads = self.get_threads() 5235 log.debug("Threads: " + str(threads)) 5236 5237 # DEBUG 5238 delete_tmp = True 5239 if self.get_config().get("verbosity", "warning") in ["debug"]: 5240 delete_tmp = False 5241 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5242 5243 # Config 5244 config = self.get_config() 5245 log.debug("Config: " + str(config)) 5246 5247 # Config - Folders - Databases 5248 databases_folders = ( 5249 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 5250 ) 5251 log.debug("Databases annotations: " + str(databases_folders)) 5252 5253 # Config - snpEff bin command 5254 snpeff_bin_command = get_bin_command( 5255 bin="snpEff.jar", 5256 tool="snpeff", 5257 bin_type="jar", 5258 config=config, 5259 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 5260 ) 5261 if not snpeff_bin_command: 5262 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 5263 log.error(msg_err) 5264 raise ValueError(msg_err) 5265 5266 # Config - snpEff databases 5267 snpeff_databases = ( 5268 config.get("folders", {}) 5269 .get("databases", {}) 5270 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 5271 ) 5272 snpeff_databases = full_path(snpeff_databases) 5273 if snpeff_databases is not None and 
snpeff_databases != "": 5274 log.debug(f"Create snpEff databases folder") 5275 if not os.path.exists(snpeff_databases): 5276 os.makedirs(snpeff_databases) 5277 5278 # Param 5279 param = self.get_param() 5280 log.debug("Param: " + str(param)) 5281 5282 # Param 5283 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 5284 log.debug("Options: " + str(options)) 5285 5286 # Param - Assembly 5287 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5288 5289 # Param - Options 5290 snpeff_options = ( 5291 param.get("annotation", {}).get("snpeff", {}).get("options", "") 5292 ) 5293 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 5294 snpeff_csvstats = ( 5295 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 5296 ) 5297 if snpeff_stats: 5298 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 5299 snpeff_stats = full_path(snpeff_stats) 5300 snpeff_options += f" -stats {snpeff_stats}" 5301 if snpeff_csvstats: 5302 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 5303 snpeff_csvstats = full_path(snpeff_csvstats) 5304 snpeff_options += f" -csvStats {snpeff_csvstats}" 5305 5306 # Data 5307 table_variants = self.get_table_variants() 5308 5309 # Check if not empty 5310 log.debug("Check if not empty") 5311 sql_query_chromosomes = ( 5312 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5313 ) 5314 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 5315 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5316 log.info(f"VCF empty") 5317 return 5318 5319 # Export in VCF 5320 log.debug("Create initial file to annotate") 5321 tmp_vcf = NamedTemporaryFile( 5322 prefix=self.get_prefix(), 5323 dir=self.get_tmp_dir(), 5324 suffix=".vcf.gz", 5325 delete=True, 5326 ) 5327 tmp_vcf_name = tmp_vcf.name 5328 5329 # VCF header 5330 vcf_reader = self.get_header() 5331 log.debug("Initial header: " + 
str(vcf_reader.infos)) 5332 5333 # Existing annotations 5334 for vcf_annotation in self.get_header().infos: 5335 5336 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5337 log.debug( 5338 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5339 ) 5340 5341 # Memory limit 5342 # if config.get("memory", None): 5343 # memory_limit = config.get("memory", "8G") 5344 # else: 5345 # memory_limit = "8G" 5346 memory_limit = self.get_memory("8G") 5347 log.debug(f"memory_limit: {memory_limit}") 5348 5349 # snpEff java options 5350 snpeff_java_options = ( 5351 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 5352 ) 5353 log.debug(f"Exomiser java options: {snpeff_java_options}") 5354 5355 force_update_annotation = True 5356 5357 if "ANN" not in self.get_header().infos or force_update_annotation: 5358 5359 # Check snpEff database 5360 log.debug(f"Check snpEff databases {[assembly]}") 5361 databases_download_snpeff( 5362 folder=snpeff_databases, assemblies=[assembly], config=config 5363 ) 5364 5365 # Export VCF file 5366 self.export_variant_vcf( 5367 vcf_file=tmp_vcf_name, 5368 remove_info=True, 5369 add_samples=False, 5370 index=True, 5371 ) 5372 5373 # Tmp file 5374 err_files = [] 5375 tmp_annotate_vcf = NamedTemporaryFile( 5376 prefix=self.get_prefix(), 5377 dir=self.get_tmp_dir(), 5378 suffix=".vcf", 5379 delete=False, 5380 ) 5381 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5382 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5383 err_files.append(tmp_annotate_vcf_name_err) 5384 5385 # Command 5386 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 5387 log.debug(f"Annotation - snpEff command: {snpeff_command}") 5388 run_parallel_commands([snpeff_command], 1) 5389 5390 # Error messages 5391 log.info(f"Error/Warning messages:") 5392 error_message_command_all = [] 5393 
error_message_command_warning = [] 5394 error_message_command_err = [] 5395 for err_file in err_files: 5396 with open(err_file, "r") as f: 5397 for line in f: 5398 message = line.strip() 5399 error_message_command_all.append(message) 5400 if line.startswith("[W::"): 5401 error_message_command_warning.append(message) 5402 if line.startswith("[E::"): 5403 error_message_command_err.append(f"{err_file}: " + message) 5404 # log info 5405 for message in list( 5406 set(error_message_command_err + error_message_command_warning) 5407 ): 5408 log.info(f" {message}") 5409 # debug info 5410 for message in list(set(error_message_command_all)): 5411 log.debug(f" {message}") 5412 # failed 5413 if len(error_message_command_err): 5414 log.error("Annotation failed: Error in commands") 5415 raise ValueError("Annotation failed: Error in commands") 5416 5417 # Find annotation in header 5418 with open(tmp_annotate_vcf_name, "rt") as f: 5419 header_list = self.read_vcf_header(f) 5420 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5421 5422 for ann in annovar_vcf_header.infos: 5423 if ann not in self.get_header().infos: 5424 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5425 5426 # Update variants 5427 log.info(f"Annotation - Updating...") 5428 self.update_from_vcf(tmp_annotate_vcf_name) 5429 5430 else: 5431 if "ANN" in self.get_header().infos: 5432 log.debug(f"Existing snpEff annotations in VCF") 5433 if force_update_annotation: 5434 log.debug(f"Existing snpEff annotations in VCF - annotation forced") 5435 5436 def annotation_annovar(self, threads: int = None) -> None: 5437 """ 5438 It takes a VCF file, annotates it with Annovar, and then updates the database with the new 5439 annotations 5440 5441 :param threads: number of threads to use 5442 :return: the value of the variable "return_value". 
5443 """ 5444 5445 # DEBUG 5446 log.debug("Start annotation with Annovar databases") 5447 5448 # Threads 5449 if not threads: 5450 threads = self.get_threads() 5451 log.debug("Threads: " + str(threads)) 5452 5453 # Tmp en Err files 5454 tmp_files = [] 5455 err_files = [] 5456 5457 # DEBUG 5458 delete_tmp = True 5459 if self.get_config().get("verbosity", "warning") in ["debug"]: 5460 delete_tmp = False 5461 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5462 5463 # Config 5464 config = self.get_config() 5465 log.debug("Config: " + str(config)) 5466 5467 # Config - Folders - Databases 5468 databases_folders = ( 5469 config.get("folders", {}).get("databases", {}).get("annovar", ["."]) 5470 ) 5471 log.debug("Databases annotations: " + str(databases_folders)) 5472 5473 # Config - annovar bin command 5474 annovar_bin_command = get_bin_command( 5475 bin="table_annovar.pl", 5476 tool="annovar", 5477 bin_type="perl", 5478 config=config, 5479 default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar", 5480 ) 5481 if not annovar_bin_command: 5482 msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'" 5483 log.error(msg_err) 5484 raise ValueError(msg_err) 5485 5486 # Config - BCFTools bin command 5487 bcftools_bin_command = get_bin_command( 5488 bin="bcftools", 5489 tool="bcftools", 5490 bin_type="bin", 5491 config=config, 5492 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 5493 ) 5494 if not bcftools_bin_command: 5495 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 5496 log.error(msg_err) 5497 raise ValueError(msg_err) 5498 5499 # Config - annovar databases 5500 annovar_databases = ( 5501 config.get("folders", {}) 5502 .get("databases", {}) 5503 .get("annovar", DEFAULT_ANNOVAR_FOLDER) 5504 ) 5505 if annovar_databases is not None: 5506 if isinstance(annovar_databases, list): 5507 annovar_databases = full_path(annovar_databases[0]) 5508 log.warning(f"Annovar databases folder '{annovar_databases}' selected") 5509 annovar_databases = 
full_path(annovar_databases) 5510 if not os.path.exists(annovar_databases): 5511 log.info(f"Annovar databases folder '{annovar_databases}' created") 5512 Path(annovar_databases).mkdir(parents=True, exist_ok=True) 5513 else: 5514 msg_err = f"Annovar databases configuration failed" 5515 log.error(msg_err) 5516 raise ValueError(msg_err) 5517 5518 # Param 5519 param = self.get_param() 5520 log.debug("Param: " + str(param)) 5521 5522 # Param - options 5523 options = param.get("annotation", {}).get("annovar", {}).get("options", {}) 5524 log.debug("Options: " + str(options)) 5525 5526 # Param - annotations 5527 annotations = ( 5528 param.get("annotation", {}).get("annovar", {}).get("annotations", {}) 5529 ) 5530 log.debug("Annotations: " + str(annotations)) 5531 5532 # Param - Assembly 5533 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5534 5535 # Annovar database assembly 5536 annovar_databases_assembly = f"{annovar_databases}/{assembly}" 5537 if annovar_databases_assembly != "" and not os.path.exists( 5538 annovar_databases_assembly 5539 ): 5540 os.makedirs(annovar_databases_assembly) 5541 5542 # Data 5543 table_variants = self.get_table_variants() 5544 5545 # Check if not empty 5546 log.debug("Check if not empty") 5547 sql_query_chromosomes = ( 5548 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5549 ) 5550 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 5551 if not sql_query_chromosomes_df["count"][0]: 5552 log.info(f"VCF empty") 5553 return 5554 5555 # VCF header 5556 vcf_reader = self.get_header() 5557 log.debug("Initial header: " + str(vcf_reader.infos)) 5558 5559 # Existing annotations 5560 for vcf_annotation in self.get_header().infos: 5561 5562 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5563 log.debug( 5564 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5565 ) 5566 5567 force_update_annotation = True 5568 5569 if annotations: 5570 5571 
commands = [] 5572 tmp_annotates_vcf_name_list = [] 5573 5574 # Export in VCF 5575 log.debug("Create initial file to annotate") 5576 tmp_vcf = NamedTemporaryFile( 5577 prefix=self.get_prefix(), 5578 dir=self.get_tmp_dir(), 5579 suffix=".vcf.gz", 5580 delete=False, 5581 ) 5582 tmp_vcf_name = tmp_vcf.name 5583 tmp_files.append(tmp_vcf_name) 5584 tmp_files.append(tmp_vcf_name + ".tbi") 5585 5586 # Export VCF file 5587 self.export_variant_vcf( 5588 vcf_file=tmp_vcf_name, 5589 remove_info=".", 5590 add_samples=False, 5591 index=True, 5592 ) 5593 5594 # Create file for field rename 5595 log.debug("Create file for field rename") 5596 tmp_rename = NamedTemporaryFile( 5597 prefix=self.get_prefix(), 5598 dir=self.get_tmp_dir(), 5599 suffix=".rename", 5600 delete=False, 5601 ) 5602 tmp_rename_name = tmp_rename.name 5603 tmp_files.append(tmp_rename_name) 5604 5605 # Check Annovar database 5606 log.debug( 5607 f"Check Annovar databases {[assembly]}: {list(annotations.keys())}" 5608 ) 5609 databases_download_annovar( 5610 folder=annovar_databases, 5611 files=list(annotations.keys()), 5612 assemblies=[assembly], 5613 ) 5614 5615 for annotation in annotations: 5616 annotation_fields = annotations[annotation] 5617 5618 if not annotation_fields: 5619 annotation_fields = {"INFO": None} 5620 5621 log.info(f"Annotations Annovar - database '{annotation}'") 5622 log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}") 5623 5624 # Tmp file for annovar 5625 err_files = [] 5626 tmp_annotate_vcf_directory = TemporaryDirectory( 5627 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar" 5628 ) 5629 tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar" 5630 tmp_annotate_vcf_name_annovar = ( 5631 tmp_annotate_vcf_prefix + "." 
+ assembly + "_multianno.vcf" 5632 ) 5633 tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err" 5634 err_files.append(tmp_annotate_vcf_name_err) 5635 tmp_files.append(tmp_annotate_vcf_name_err) 5636 5637 # Tmp file final vcf annotated by annovar 5638 tmp_annotate_vcf = NamedTemporaryFile( 5639 prefix=self.get_prefix(), 5640 dir=self.get_tmp_dir(), 5641 suffix=".vcf.gz", 5642 delete=False, 5643 ) 5644 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5645 tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name) 5646 tmp_files.append(tmp_annotate_vcf_name) 5647 tmp_files.append(tmp_annotate_vcf_name + ".tbi") 5648 5649 # Number of fields 5650 annotation_list = [] 5651 annotation_renamed_list = [] 5652 5653 for annotation_field in annotation_fields: 5654 5655 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 5656 annotation_fields_new_name = annotation_fields.get( 5657 annotation_field, annotation_field 5658 ) 5659 if not annotation_fields_new_name: 5660 annotation_fields_new_name = annotation_field 5661 5662 if ( 5663 force_update_annotation 5664 or annotation_fields_new_name not in self.get_header().infos 5665 ): 5666 annotation_list.append(annotation_field) 5667 annotation_renamed_list.append(annotation_fields_new_name) 5668 else: # annotation_fields_new_name in self.get_header().infos and not force_update_annotation: 5669 log.warning( 5670 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 5671 ) 5672 5673 # Add rename info 5674 run_parallel_commands( 5675 [ 5676 f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}" 5677 ], 5678 1, 5679 ) 5680 5681 # log.debug("fields_to_removed: " + str(fields_to_removed)) 5682 log.debug("annotation_list: " + str(annotation_list)) 5683 5684 # protocol 5685 protocol = annotation 5686 5687 # argument 5688 argument = "" 5689 5690 # operation 5691 operation = "f" 5692 if annotation in ["refGene", "refGeneWithVer"] or 
annotation.startswith( 5693 "ensGene" 5694 ): 5695 operation = "g" 5696 if options.get("genebase", None): 5697 argument = f"""'{options.get("genebase","")}'""" 5698 elif annotation in ["cytoBand"]: 5699 operation = "r" 5700 5701 # argument option 5702 argument_option = "" 5703 if argument != "": 5704 argument_option = " --argument " + argument 5705 5706 # command options 5707 command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """ # --intronhgvs 10 5708 for option in options: 5709 if option not in ["genebase"]: 5710 command_options += f""" --{option}={options[option]}""" 5711 5712 # Command 5713 5714 # Command - Annovar 5715 command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """ 5716 tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf") 5717 5718 # Command - start pipe 5719 command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """ 5720 5721 # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!) 
5722 command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """ 5723 5724 # Command - Special characters (refGene annotation) 5725 command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """ 5726 5727 # Command - Clean empty fields (with value ".") 5728 command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """ 5729 5730 # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file 5731 annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"] 5732 if "ALL" not in annotation_list and "INFO" not in annotation_list: 5733 # for ann in annotation_renamed_list: 5734 for ann in annotation_list: 5735 annovar_fields_to_keep.append(f"^INFO/{ann}") 5736 5737 command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """ 5738 5739 # Command - indexing 5740 command_annovar += f""" && tabix {tmp_annotate_vcf_name} """ 5741 5742 log.debug(f"Annotation - Annovar command: {command_annovar}") 5743 run_parallel_commands([command_annovar], 1) 5744 5745 # Error messages 5746 log.info(f"Error/Warning messages:") 5747 error_message_command_all = [] 5748 error_message_command_warning = [] 5749 error_message_command_err = [] 5750 for err_file in err_files: 5751 with open(err_file, "r") as f: 5752 for line in f: 5753 message = line.strip() 5754 error_message_command_all.append(message) 5755 if line.startswith("[W::") or line.startswith("WARNING"): 5756 error_message_command_warning.append(message) 5757 if line.startswith("[E::") or line.startswith("ERROR"): 5758 
error_message_command_err.append( 5759 f"{err_file}: " + message 5760 ) 5761 # log info 5762 for message in list( 5763 set(error_message_command_err + error_message_command_warning) 5764 ): 5765 log.info(f" {message}") 5766 # debug info 5767 for message in list(set(error_message_command_all)): 5768 log.debug(f" {message}") 5769 # failed 5770 if len(error_message_command_err): 5771 log.error("Annotation failed: Error in commands") 5772 raise ValueError("Annotation failed: Error in commands") 5773 5774 if tmp_annotates_vcf_name_list: 5775 5776 # List of annotated files 5777 tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list) 5778 5779 # Tmp file 5780 tmp_annotate_vcf = NamedTemporaryFile( 5781 prefix=self.get_prefix(), 5782 dir=self.get_tmp_dir(), 5783 suffix=".vcf.gz", 5784 delete=False, 5785 ) 5786 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5787 tmp_files.append(tmp_annotate_vcf_name) 5788 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5789 err_files.append(tmp_annotate_vcf_name_err) 5790 tmp_files.append(tmp_annotate_vcf_name_err) 5791 5792 # Command merge 5793 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} " 5794 log.info( 5795 f"Annotation Annovar - Annotation merging " 5796 + str(len(tmp_annotates_vcf_name_list)) 5797 + " annotated files" 5798 ) 5799 log.debug(f"Annotation - merge command: {merge_command}") 5800 run_parallel_commands([merge_command], 1) 5801 5802 # Find annotation in header 5803 with bgzf.open(tmp_annotate_vcf_name, "rt") as f: 5804 header_list = self.read_vcf_header(f) 5805 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5806 5807 for ann in annovar_vcf_header.infos: 5808 if ann not in self.get_header().infos: 5809 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5810 5811 # Update variants 5812 log.info(f"Annotation Annovar - 
Updating...")
        self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)

    # Parquet
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table with one or more parquet/duckdb annotation databases.

        For each database configured under param['annotation']['parquet']['annotations'],
        the requested INFO fields are matched against the variants table (by
        #CHROM/POS/REF/ALT for "variants" databases, by positional overlap for
        "regions" databases) and appended to the INFO column through per-chromosome
        UPDATE queries. The in-memory VCF header is extended with the new INFO fields.
        The special annotation key "ALL" triggers a scan of available databases; the
        special field keys "ALL"/"INFO" select every field of a database.

        :param threads: number of threads to use for the annotation (defaults to
            self.get_threads())
        :return: None; the variants table and the header are modified in place
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads (NOTE(review): only logged here; parallelism is handled by duckdb — confirm)
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - search folders for annotation databases ("annotations" + "parquet")
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - mapping of database -> {field: new_name} to annotate with
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param overrides config, falls back to DEFAULT_ASSEMBLY)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation: re-annotate fields already present in the header
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        # Force Append Annotation: only fill fields whose current value is empty/'.'
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header (mutated below when new INFO fields are added)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS (total variant count, used for the final summary log)
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations (log only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns
        # NOTE(review): never appended to in this method, so the cleanup loop at the
        # end is currently a no-op — confirm whether this is a leftover or a hook
        added_columns = []

        # drop indexes (UPDATEs below would otherwise be slowed/invalidated)
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # "ALL" pseudo-annotation: scan available databases and add each one
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                # "ALL" is a directive, not a database
                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields ({} / None means: take the whole INFO column)
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database (resolves the file within databases_folders/assembly)
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists (both data and .hdr header are required)
                if not parquet_file or not parquet_hdr_file:
                    msg_err_list = []
                    if not parquet_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file not found"
                        )
                    if parquet_file and not parquet_hdr_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
                        )

                    log.error(". ".join(msg_err_list))
                    raise ValueError(". ".join(msg_err_list))
                else:
                    # Get parquet connexion (ATTACH external database when needed)
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos (database columns beyond the VCF core columns)
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields:
                    # synthesize a String INFO definition for each extra column
                    # not already described in the database header
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database: expand "ALL"/"INFO" to every header field
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use (SQL CASE fragments per field)
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate (for regions databases)
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping (annotation field -> database column)
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column (fallback to the raw INFO column)
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate: field exists in the database header, and either
                        # update/append is forced or the field is new to this VCF
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO (so the new value replaces it)
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO =
                                        REGEXP_REPLACE(
                                            concat(table_variants.INFO,''),
                                            ';*{annotation_fields_new_name}=[^;]*',
                                            ''
                                        )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO (';' from the second field on)
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header (defaulting num/type/desc/source/version)
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append: only write when the current value is empty/'.'
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column: extract 'field=value' from the database INFO
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                    CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                    END
                                    """
                                )
                            # Found in a specific column: cast and sanitize (';' -> ',')
                            else:
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                    CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
                                        ELSE ''
                                    END
                                    """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # (shortcut: copy the whole database INFO column instead of
                    # re-assembling it field by field)
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                        """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init (removal queries run before the per-chromosome updates)
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        query_dict = query_dict_remove

                        # One UPDATE query per chromosome
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database: join on positional
                            # overlap and aggregate matching region values per POS
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                    )
                                    as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database: exact variant match
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query (appends new fields to INFO with a
                            # ';' separator only when INFO is non-empty)
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                SET INFO =
                                    concat(
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                            THEN table_variants.INFO
                                            ELSE ''
                                        END
                                        ,
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                                AND (
                                                    concat({sql_query_annotation_update_info_sets_sql})
                                                )
                                                NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        {sql_query_annotation_update_info_sets_sql}
                                    )
                                {sql_query_annotation_from_clause}
                                WHERE {sql_query_annotation_where_clause}
                                ;
                            """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x (large concat() expressions
                        # exceed duckdb's default expression depth)
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # duckdb UPDATE returns the affected-row count in "Count"
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns (currently a no-op, see NOTE above)
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def annotation_splice(self, threads: int = None) -> None:
        """
        Annotate variants with splice prediction tools (SPiP/SpliceAI) run through
        a dockerized Nextflow pipeline.

        Variants not already carrying SpliceAI_*/SPiP_* INFO fields are exported to
        a temporary VCF, processed by the splice container, and the resulting
        annotations are merged back into the variants table and the header.

        :param threads: The number of threads to use (forwarded as --threads to the
            pipeline when not set in options)
        :return: None (returns early when the VCF is empty, nothing is left to
            annotate, or the splice configuration is incomplete)
        """

        # DEBUG
        log.debug("Start annotation with splice tools")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed but never used in this method — confirm
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))
        splice_config = config.get("tools", {}).get("splice", {})
        if not splice_config:
            # NOTE(review): the fallback assignment below is dead code — the raise is
            # unconditional, so DEFAULT_TOOLS_BIN is never actually used. Likely the
            # intent was to raise only if the default is also empty — TODO fix
            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
            msg_err = "No Splice tool config"
            raise ValueError(msg_err)
        log.debug(f"splice_config: {splice_config}")

        # Config - Folders - Databases
        # NOTE(review): databases_folders is only logged, never used — confirm
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("splice", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Splice docker image
        splice_docker_image = splice_config.get("docker").get("image")

        # Pull splice image if it's not already there
        if not check_docker_image_exists(splice_docker_image):
            log.warning(
                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
            )
            try:
                command(f"docker pull {splice_config.get('docker').get('image')}")
            except subprocess.CalledProcessError:
                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Config - splice databases
        # NOTE(review): splice_databases is computed but never used afterwards — confirm
        splice_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("splice", DEFAULT_SPLICE_FOLDER)
        )
        splice_databases = full_path(splice_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - splice-specific options
        options = param.get("annotation", {}).get("splice", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info("VCF empty")
            return None

        # Export in VCF
        log.debug("Create initial file to annotate")

        # Create output folder / work folder
        if options.get("output_folder", ""):
            output_folder = options.get("output_folder", "")
            if not os.path.exists(output_folder):
                Path(output_folder).mkdir(parents=True, exist_ok=True)
        else:
            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
            if not os.path.exists(output_folder):
                Path(output_folder).mkdir(parents=True, exist_ok=True)

        # NOTE(review): workdir is set but never used in this method — confirm
        if options.get("workdir", ""):
            workdir = options.get("workdir", "")
        else:
            workdir = "/work"

        # Create tmp VCF file (delete=False: the container must be able to read it)
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=output_folder,
            suffix=".vcf",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header (extended below with the fields found in the annotated output)
        header = self.get_header()

        # Existing annotations (log only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Memory limit
        # NOTE(review): memory_limit is computed but never used afterwards — confirm
        if config.get("memory", None):
            memory_limit = config.get("memory", "8G").upper()
        else:
            memory_limit = "8G"
        log.debug(f"memory_limit: {memory_limit}")

        # Check number of variants to annotate: skip variants that already carry
        # SpliceAI_* or SPiP_* INFO fields
        where_clause_regex_spliceai = r"SpliceAI_\w+"
        where_clause_regex_spip = r"SPiP_\w+"
        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
        df_list_of_variants_to_annotate = self.get_query_to_df(
            query=f""" SELECT * FROM variants {where_clause} """
        )
        if len(df_list_of_variants_to_annotate) == 0:
            log.warning(
                f"No variants to annotate with splice. Variants probably already annotated with splice"
            )
            return None
        else:
            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")

        # Export VCF file (INFO stripped, samples kept, restricted to the variants
        # selected by where_clause)
        self.export_variant_vcf(
            vcf_file=tmp_vcf_name,
            remove_info=True,
            add_samples=True,
            index=False,
            where_clause=where_clause,
        )
        # Docker bind mounts (output folder is shared read/write with the container)
        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
        # NOTE(review): this check is always False — the generator filters values that
        # ARE None and then any() over None values yields False. Likely intended:
        # any(value is None for value in splice_config.values()) — TODO fix
        if any(value for value in splice_config.values() if value is None):
            log.warning("At least one splice config parameter is empty")
            # exit annotation_splice
            return None

        # Params in splice nf
        def check_values(dico: dict):
            """
            Yield '--key value' Nextflow CLI parameters for the given options dict.
            'genome' is normalized to hg19/hg38; other entries are kept when they
            are non-empty strings, ints or bools.
            """
            for key, val in dico.items():
                if key == "genome":
                    if any(
                        assemb in options.get("genome", {})
                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
                    ):
                        yield f"--{key} hg19"
                    elif any(
                        assemb in options.get("genome", {})
                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
                    ):
                        yield f"--{key} hg38"
                elif (
                    (isinstance(val, str) and val)
                    or isinstance(val, int)
                    or isinstance(val, bool)
                ):
                    yield f"--{key} {val}"

        # Genome (options override config assembly)
        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
        options["genome"] = genome
        # NF params
        nf_params = []
        # Add options
        if options:
            log.debug(options)
            nf_params = list(check_values(options))
            log.debug(f"Splice NF params: {' '.join(nf_params)}")
        else:
            log.debug("No NF params provided")
        # Add threads
        if "threads" not in options.keys():
            nf_params.append(f"--threads {threads}")
        # Genome path
        genome_path = find_genome(
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER),
            file=f"{genome}.fa",
        )
        # Add genome path
        if not genome_path:
            raise ValueError(
                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
            )
        else:
            log.debug(f"Genome: {genome_path}")
        nf_params.append(f"--genome_path {genome_path}")

        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
            """
            Locate local SPiP/SpliceAI transcriptome databases and return the
            corresponding Nextflow CLI parameters, or [] to fall back to the
            annotation files bundled in the docker image.
            """

            try:

                # SpliceAI assembly transcriptome
                spliceai_assembly = os.path.join(
                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
                    options.get("genome"),
                    "transcriptome",
                )
                spip_assembly = options.get("genome")

                spip = find(
                    f"transcriptome_{spip_assembly}.RData",
                    config.get("folders", {}).get("databases", {}).get("spip", {}),
                )
                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
                log.debug(f"SPiP annotations: {spip}")
                log.debug(f"SpliceAI annotations: {spliceai}")
                if spip and spliceai:
                    return [
                        f"--spip_transcriptome {spip}",
                        f"--spliceai_transcriptome {spliceai}",
                    ]
                else:
                    log.warning(
                        "Can't find splice databases in configuration, use annotations file from image"
                    )
            except TypeError:
                # os.path.join/find raise TypeError when a folder entry is missing
                log.warning(
                    "Can't find splice databases in configuration, use annotations file from image"
                )
            return []

        # Add options, check if transcriptome option have already beend provided
        if (
            "spip_transcriptome" not in nf_params
            and "spliceai_transcriptome" not in nf_params
        ):
            splice_reference = splice_annotations(options, config)
            if splice_reference:
                nf_params.extend(splice_reference)
        # nf_params.append(f"--output_folder {output_folder}")
        random_uuid = f"HOWARD-SPLICE-{get_random()}"
        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
        log.debug(cmd)
        splice_config["docker"]["command"] = cmd

        # Ensure proxy is set (forward host proxy environment into the container)
        proxy = [
            f"-e {var}={os.getenv(var)}"
            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
            if os.getenv(var) is not None
        ]
        docker_cmd = get_bin_command(
            tool="splice",
            bin_type="docker",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
        )
        # Docker debug
        # if splice_config.get("rm_container"):
        #     rm_container = "--rm"
        # else:
        #     rm_container = ""
        # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
        log.debug(docker_cmd)
        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
        log.debug(res.stdout)
        if res.stderr:
            log.error(res.stderr)
        # Raise CalledProcessError if the container exited non-zero
        res.check_returncode()
        # Update variants
        log.info("Annotation - Updating...")
        # Test find output vcf
        log.debug(
            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
        )
        output_vcf = []
        # Wrong folder to look in
        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
            if (
                files
                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
            ):
                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
        # log.debug(os.listdir(options.get("output_folder")))
        # NOTE(review): output_vcf[0] is accessed BEFORE the emptiness check below —
        # raises IndexError when the pipeline produced no output. Move this log into
        # the else branch — TODO fix
        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
        if not output_vcf:
            log.debug(
                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
            )
        else:
            # Get new header from annotated vcf
            log.debug(f"Initial header: {len(header.infos)} fields")
            # Create new header with splice infos
            new_vcf = Variants(input=output_vcf[0])
            new_vcf_header = new_vcf.get_header().infos
            for keys, infos in new_vcf_header.items():
                if keys not in header.infos.keys():
                    header.infos[keys] = infos
            log.debug(f"New header: {len(header.infos)} fields")
            log.debug(f"Splice tmp output: {output_vcf[0]}")
            self.update_from_vcf(output_vcf[0])

        # Remove file
        # NOTE(review): output_vcf is a list, not a path — confirm remove_if_exists
        # accepts a list of paths
        remove_if_exists(output_vcf)

    ###
    # Prioritization
    ###

    def get_config_default(self, name: str) -> dict:
        """
        Return the built-in default configuration for a given section.

        Known sections are "calculations" (sql- and python-based calculation
        operations) and "prioritizations" (default prioritization profile).

        :param name: the name of the configuration section to retrieve
        :type name: str
        :return: the default configuration dict for the given name, or None when
        the name does not match a known section (note: None, not an empty dict)
        """

        config_default = {
            # Default calculation operations: "sql" entries are executed as duckdb
            # queries; "python" entries dispatch to the named method with the
            # listed positional parameters
            "calculations": {
                "variant_chr_pos_alt_ref": {
                    "type": "sql",
                    "name": "variant_chr_pos_alt_ref",
                    "description": "Create a variant ID with chromosome, position, alt and ref",
                    "available": False,
                    "output_column_name": "variant_chr_pos_alt_ref",
                    "output_column_type": "String",
                    "output_column_description": "variant ID with chromosome, position, alt and ref",
                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
                    "operation_info": True,
                },
                "VARTYPE": {
                    "type": "sql",
                    "name": "VARTYPE",
                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
                    "available": True,
                    "table": "variants",
                    "output_column_name": "VARTYPE",
                    "output_column_type": "String",
                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
                    "operation_query": """
                        CASE
                            WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
                            WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
                            WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
                            WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
                            WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
                            ELSE 'UNDEFINED'
                        END
                    """,
                    "info_fields": ["SVTYPE"],
                    "operation_info": True,
                },
                "snpeff_hgvs": {
                    "type": "python",
                    "name": "snpeff_hgvs",
                    "description": "HGVS nomenclatures from snpEff annotation",
                    "available": True,
                    "function_name": "calculation_extract_snpeff_hgvs",
                    "function_params": ["snpeff_hgvs", "ANN"],
                },
                # NOTE(review): the descriptions of 'snpeff_ann_explode' and
                # 'snpeff_ann_explode_uniquify' look swapped with respect to their
                # uniquify flag (first function_params element) — confirm
                "snpeff_ann_explode": {
                    "type": "python",
                    "name": "snpeff_ann_explode",
                    "description": "Explode snpEff annotations with uniquify values",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "fields", "snpeff_", "ANN"],
                },
                "snpeff_ann_explode_uniquify": {
                    "type": "python",
                    "name": "snpeff_ann_explode_uniquify",
                    "description": "Explode snpEff annotations",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
                },
                "snpeff_ann_explode_json": {
                    "type": "python",
                    "name": "snpeff_ann_explode_json",
                    "description": "Explode snpEff annotations in JSON format",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
                },
                "NOMEN": {
                    "type": "python",
                    "name": "NOMEN",
                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
                    "available": True,
                    "function_name": "calculation_extract_nomen",
                    "function_params": [],
                },
                "FINDBYPIPELINE": {
                    "type": "python",
                    "name": "FINDBYPIPELINE",
                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbypipeline"],
                },
                "FINDBYSAMPLE": {
                    "type": "python",
                    "name": "FINDBYSAMPLE",
                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbysample"],
                },
                "GENOTYPECONCORDANCE": {
                    "type": "python",
                    "name": "GENOTYPECONCORDANCE",
                    "description": "Concordance of genotype for multi caller VCF",
                    "available": True,
                    "function_name": "calculation_genotype_concordance",
                    "function_params": [],
                },
                "BARCODE": {
                    "type": "python",
                    "name": "BARCODE",
                    "description": "BARCODE as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode",
                    "function_params": [],
                },
                "BARCODEFAMILY": {
                    "type": "python",
                    "name": "BARCODEFAMILY",
                    "description": "BARCODEFAMILY as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode_family",
                    "function_params": ["BCF"],
                },
                "TRIO": {
                    "type": "python",
                    "name": "TRIO",
                    "description": "Inheritance for a trio family",
                    "available": True,
                    "function_name": "calculation_trio",
                    "function_params": [],
                },
                "VAF": {
                    "type": "python",
                    "name": "VAF",
                    "description": "Variant Allele Frequency (VAF) harmonization",
                    "available": True,
                    "function_name": "calculation_vaf_normalization",
                    "function_params": [],
                },
                "VAF_stats": {
                    "type": "python",
                    "name": "VAF_stats",
                    "description": "Variant Allele Frequency (VAF) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["VAF"],
                },
                "DP_stats": {
                    "type": "python",
                    "name": "DP_stats",
                    "description": "Depth (DP) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["DP"],
                },
                "variant_id": {
                    "type": "python",
                    "name": "variant_id",
                    "description": "Variant ID generated from variant position and type",
                    "available": True,
                    "function_name": "calculation_variant_id",
                    "function_params": [],
                },
                "transcripts_json": {
                    "type": "python",
                    "name": "transcripts_json",
                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": ["transcripts_json", None],
                },
                "transcripts_ann": {
                    "type": "python",
                    "name": "transcripts_ann",
                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, "transcripts_ann"],
                },
                "transcripts_annotations": {
                    "type": "python",
                    "name": "transcripts_annotations",
                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, None],
                },
                "transcripts_prioritization": {
                    "type": "python",
                    "name": "transcripts_prioritization",
                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
                    "available": True,
                    "function_name": "calculation_transcripts_prioritization",
                    "function_params": [],
                },
                "transcripts_export": {
                    "type": "python",
                    "name": "transcripts_export",
                    "description": "Export transcripts table/view as a file (using param.json)",
                    "available": True,
                    "function_name": "calculation_transcripts_export",
                    "function_params": [],
                },
            },
            # Default prioritization profile: scoring/flagging rules applied to the
            # snpEff impact field (ANN2)
            "prioritizations": {
                "default": {
                    "ANN2": [
                        {
                            "type": "contains",
                            "value": "HIGH",
                            "score": 5,
                            "flag": "PASS",
                            "comment": [
                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODERATE",
                            "score": 3,
                            "flag": "PASS",
                            "comment": [
                                "A non-disruptive variant that might change protein effectiveness"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "LOW",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Assumed to be mostly harmless or unlikely to change protein behavior"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODIFIER",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
                            ],
                        },
                    ],
                }
            },
        }

        # Returns None (not {}) when the section name is unknown
        return config_default.get(name, None)

    def get_config_json(
        self, name: str, config_dict: dict = {}, config_file: str = None
    ) -> dict:
        """
        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
        default values, a dictionary, and a file.

        :param name: The `name` parameter in the `get_config_json` function is a string that represents
        the name of the configuration. It is used to identify and retrieve the configuration settings
        for a specific component or module
        :type name: str
        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
        dictionary that allows you to provide additional configuration settings or overrides. When you
        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
        the key is the configuration setting you want to override or
        :type config_dict: dict
        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
        specify the path to a configuration file that contains additional settings. If provided, the
        function will read the contents of this file and update the configuration dictionary with the
        values found in the file, overriding any existing values with the
        :type config_file: str
        :return: The function `get_config_json` returns a dictionary containing the configuration
        settings.
7021 """ 7022 7023 # Create with default prioritizations 7024 config_default = self.get_config_default(name=name) 7025 configuration = config_default 7026 # log.debug(f"configuration={configuration}") 7027 7028 # Replace prioritizations from dict 7029 for config in config_dict: 7030 configuration[config] = config_dict[config] 7031 7032 # Replace prioritizations from file 7033 config_file = full_path(config_file) 7034 if config_file: 7035 if os.path.exists(config_file): 7036 with open(config_file) as config_file_content: 7037 config_file_dict = json.load(config_file_content) 7038 for config in config_file_dict: 7039 configuration[config] = config_file_dict[config] 7040 else: 7041 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 7042 log.error(msg_error) 7043 raise ValueError(msg_error) 7044 7045 return configuration 7046 7047 def prioritization( 7048 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 7049 ) -> bool: 7050 """ 7051 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 7052 prioritizes variants based on configured profiles and criteria. 7053 7054 :param table: The `table` parameter in the `prioritization` function is used to specify the name 7055 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 7056 a table name is provided, the method will prioritize the variants in that specific table 7057 :type table: str 7058 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 7059 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 7060 provided, the code will use a default prefix value of "PZ" 7061 :type pz_prefix: str 7062 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 7063 additional parameters specific to the prioritization process. 
    def prioritization(
        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
    ) -> bool:
        """
        Prioritize variants of a table according to configured profiles and criteria,
        writing the results into new PZ* INFO fields.

        For each requested profile, criteria are translated into SQL UPDATE statements
        executed against the variants table; the resulting per-profile PZ columns are
        then concatenated back into the INFO column.

        :param table: Name of the table to prioritize; defaults to the variants
            table returned by `get_table_variants(clause="update")`
        :type table: str
        :param pz_prefix: Prefix for the generated INFO fields (e.g. "PZ" gives
            PZScore, PZFlag, ...); defaults to param "pzprefix" or "PZ"
        :type pz_prefix: str
        :param pz_param: Prioritization parameters; defaults to the "prioritization"
            section of the object's param dict (profiles, pzfields, score mode,
            separators, config file...)
        :type pz_param: dict
        :return: True when prioritization ran; False when no profile is defined
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Prioritization parameters: explicit argument wins over object param
        if pz_param is not None:
            prioritization_param = pz_param
        else:
            prioritization_param = param.get("prioritization", {})

        # Profiles configuration: defaults merged with an optional JSON config file
        prioritization_config_file = prioritization_param.get(
            "prioritization_config", None
        )
        prioritization_config_file = full_path(prioritization_config_file)
        prioritizations_config = self.get_config_json(
            name="prioritizations", config_file=prioritization_config_file
        )

        # Prioritization prefix for generated INFO fields
        pz_prefix_default = "PZ"
        if pz_prefix is None:
            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)

        # Prioritization options; comma-separated strings are accepted for
        # profiles and pzfields and normalized to lists
        profiles = prioritization_param.get("profiles", [])
        if isinstance(profiles, str):
            profiles = profiles.split(",")
        pzfields = prioritization_param.get(
            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
        )
        if isinstance(pzfields, str):
            pzfields = pzfields.split(",")
        default_profile = prioritization_param.get("default_profile", None)
        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
        prioritization_score_mode = prioritization_param.get(
            "prioritization_score_mode", "HOWARD"
        )

        # Quick Prioritizations: top-level param "prioritizations" adds profiles
        # (comma-separated) on top of the configured ones
        prioritizations = param.get("prioritizations", None)
        if prioritizations:
            log.info("Quick Prioritization:")
            for profile in prioritizations.split(","):
                if profile not in profiles:
                    profiles.append(profile)
                log.info(f"   {profile}")

        # If profile "ALL" provided, use all profiles of the configuration
        if "ALL" in profiles:
            profiles = list(prioritizations_config.keys())

        # Every requested profile must exist in the configuration
        for profile in profiles:
            if prioritizations_config.get(profile, None):
                log.debug(f"Profile '{profile}' configured")
            else:
                msg_error = f"Profile '{profile}' NOT configured"
                log.error(msg_error)
                raise ValueError(msg_error)

        if profiles:
            log.info(f"Prioritization... ")
        else:
            # Nothing to do without at least one profile
            log.debug(f"No profile defined")
            return False

        # Default profile falls back to the first requested profile
        if not default_profile and len(profiles):
            default_profile = profiles[0]

        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
        log.debug("Profiles to check: " + str(list(profiles)))

        # Table to work on
        if table is not None:
            table_variants = table
        else:
            table_variants = self.get_table_variants(clause="update")
        log.debug(f"Table to prioritize: {table_variants}")

        # Working columns added to the table, dropped again at the end
        added_columns = []

        # Full list of PZ fields: base fields plus one per (field, profile) pair
        list_of_pzfields_original = pzfields + [
            pzfield + pzfields_sep + profile
            for pzfield in pzfields
            for profile in profiles
        ]
        list_of_pzfields = []
        log.debug(f"{list_of_pzfields_original}")

        # Keep only PZ fields not already present in the VCF header
        for pzfield in list_of_pzfields_original:
            if self.get_header().infos.get(pzfield, None) is None:
                list_of_pzfields.append(pzfield)
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
            else:
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")

        if list_of_pzfields:

            # Prefix used for exploded INFO columns
            explode_infos_prefix = self.get_explode_infos_prefix()

            # VCF header descriptions for each generated PZ field
            PZfields_INFOS = {
                f"{pz_prefix}Tags": {
                    "ID": f"{pz_prefix}Tags",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant tags based on annotation criteria",
                },
                f"{pz_prefix}Score": {
                    "ID": f"{pz_prefix}Score",
                    "Number": 1,
                    "Type": "Integer",
                    "Description": "Variant score based on annotation criteria",
                },
                f"{pz_prefix}Flag": {
                    "ID": f"{pz_prefix}Flag",
                    "Number": 1,
                    "Type": "String",
                    "Description": "Variant flag based on annotation criteria",
                },
                f"{pz_prefix}Comment": {
                    "ID": f"{pz_prefix}Comment",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant comment based on annotation criteria",
                },
                f"{pz_prefix}Infos": {
                    "ID": f"{pz_prefix}Infos",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant infos based on annotation criteria",
                },
                f"{pz_prefix}Class": {
                    "ID": f"{pz_prefix}Class",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant class based on annotation criteria",
                },
            }

            # Create base (profile-less) INFO header fields if not present,
            # described against the default profile
            for field in PZfields_INFOS:
                field_ID = PZfields_INFOS[field]["ID"]
                field_description = PZfields_INFOS[field]["Description"]
                if field_ID not in self.get_header().infos and field_ID in pzfields:
                    field_description = (
                        PZfields_INFOS[field]["Description"]
                        + f", profile {default_profile}"
                    )
                    self.get_header().infos[field_ID] = vcf.parser._Info(
                        field_ID,
                        PZfields_INFOS[field]["Number"],
                        PZfields_INFOS[field]["Type"],
                        field_description,
                        "unknown",
                        "unknown",
                        code_type_map[PZfields_INFOS[field]["Type"]],
                    )

            # Create per-profile INFO header fields (e.g. PZScore_default) if not present
            for profile in prioritizations_config:
                if profile in profiles or profiles == []:
                    for field in PZfields_INFOS:
                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
                        field_description = (
                            PZfields_INFOS[field]["Description"]
                            + f", profile {profile}"
                        )
                        if (
                            field_ID not in self.get_header().infos
                            and field in pzfields
                        ):
                            self.get_header().infos[field_ID] = vcf.parser._Info(
                                field_ID,
                                PZfields_INFOS[field]["Number"],
                                PZfields_INFOS[field]["Type"],
                                field_description,
                                "unknown",
                                "unknown",
                                code_type_map[PZfields_INFOS[field]["Type"]],
                            )

            # Add one working column per new PZ field to the variants table,
            # typed and defaulted by field kind (Score: 0, Flag: true/PASS,
            # Class: empty list, others: empty string)
            for pzfield in list_of_pzfields:
                if re.match(f"{pz_prefix}Score.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="INTEGER",
                        default_value="0",
                    )
                elif re.match(f"{pz_prefix}Flag.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="BOOLEAN",
                        default_value="1",
                    )
                elif re.match(f"{pz_prefix}Class.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="VARCHAR[]",
                        default_value="null",
                    )
                else:
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="STRING",
                        default_value="''",
                    )
                added_columns.append(added_column)

        # Process profiles
        if profiles:

            # Iterate configured profiles, keeping only requested ones
            for profile in prioritizations_config:

                # NOTE(review): the `profiles == []` clause can never be true
                # here since this branch requires `if profiles:` — presumably a
                # leftover from an "empty means ALL" behavior; confirm intent
                if profile in profiles or profiles == []:
                    log.info(f"Profile '{profile}'")

                    sql_set_info_option = ""

                    sql_set_info = []

                    # Build INFO concat fragments for each PZ field of this profile;
                    # the default profile additionally feeds the profile-less fields

                    # PZScore
                    if (
                        f"{pz_prefix}Score{pzfields_sep}{profile}"
                        in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            concat(
                                '{pz_prefix}Score{pzfields_sep}{profile}=',
                                {pz_prefix}Score{pzfields_sep}{profile}
                            )
                            """
                        )
                    if (
                        profile == default_profile
                        and f"{pz_prefix}Score" in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            concat(
                                '{pz_prefix}Score=',
                                {pz_prefix}Score{pzfields_sep}{profile}
                            )
                            """
                        )

                    # PZFlag: boolean column rendered as PASS/FILTERED
                    if (
                        f"{pz_prefix}Flag{pzfields_sep}{profile}"
                        in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            concat(
                                '{pz_prefix}Flag{pzfields_sep}{profile}=',
                                CASE
                                    WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                    THEN 'PASS'
                                    WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                    THEN 'FILTERED'
                                END
                            )
                            """
                        )
                    if (
                        profile == default_profile
                        and f"{pz_prefix}Flag" in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            concat(
                                '{pz_prefix}Flag=',
                                CASE
                                    WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                    THEN 'PASS'
                                    WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                    THEN 'FILTERED'
                                END
                            )
                            """
                        )

                    # PZClass: list column rendered as comma-joined distinct values, '.' when empty
                    if (
                        f"{pz_prefix}Class{pzfields_sep}{profile}"
                        in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            concat(
                                '{pz_prefix}Class{pzfields_sep}{profile}=',
                                CASE
                                    WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                    ELSE '.'
                                END
                            )

                            """
                        )
                    if (
                        profile == default_profile
                        and f"{pz_prefix}Class" in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            concat(
                                '{pz_prefix}Class=',
                                CASE
                                    WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                    ELSE '.'
                                END
                            )
                            """
                        )

                    # PZComment: only emitted when non-empty
                    if (
                        f"{pz_prefix}Comment{pzfields_sep}{profile}"
                        in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            CASE
                                WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
                                ELSE ''
                            END
                            """
                        )
                    if (
                        profile == default_profile
                        and f"{pz_prefix}Comment" in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            CASE
                                WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
                                ELSE ''
                            END
                            """
                        )

                    # PZInfos: only emitted when non-empty
                    if (
                        f"{pz_prefix}Infos{pzfields_sep}{profile}"
                        in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            CASE
                                WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
                                ELSE ''
                            END
                            """
                        )
                    if (
                        profile == default_profile
                        and f"{pz_prefix}Infos" in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            CASE
                                WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
                                ELSE ''
                            END
                            """
                        )

                    # Merge PZ field fragments into one INFO concat clause;
                    # first fragment gets no ';' separator prefix
                    sql_set_info_option = ""
                    sql_set_sep = ""
                    for sql_set in sql_set_info:
                        if sql_set_sep:
                            sql_set_info_option += f"""
                            , concat('{sql_set_sep}', {sql_set})
                            """
                        else:
                            sql_set_info_option += f"""
                            , {sql_set}
                            """
                        sql_set_sep = ";"

                    # Translate each profile criterion into an UPDATE query
                    sql_queries = []
                    for annotation in prioritizations_config[profile]:

                        # skip special sections (keys starting with "_")
                        if annotation.startswith("_"):
                            continue

                        # For each criterion of this annotation
                        for criterion in prioritizations_config[profile][
                            annotation
                        ]:

                            # Criterion mode: "operation" (type/value comparison)
                            # or "sql" (raw WHERE clause)
                            criterion_mode = None
                            if np.any(
                                np.isin(list(criterion.keys()), ["type", "value"])
                            ):
                                criterion_mode = "operation"
                            elif np.any(
                                np.isin(list(criterion.keys()), ["sql", "fields"])
                            ):
                                criterion_mode = "sql"
                            log.debug(f"Criterion Mode: {criterion_mode}")

                            # Criterion parameters; comment/infos are sanitized
                            # for safe embedding in SQL literals and INFO fields
                            criterion_type = criterion.get("type", None)
                            criterion_value = criterion.get("value", None)
                            criterion_sql = criterion.get("sql", None)
                            criterion_fields = criterion.get("fields", None)
                            criterion_score = criterion.get("score", 0)
                            criterion_flag = criterion.get("flag", "PASS")
                            criterion_class = criterion.get("class", None)
                            criterion_flag_bool = criterion_flag == "PASS"
                            criterion_comment = (
                                ", ".join(criterion.get("comment", []))
                                .replace("'", "''")
                                .replace(";", ",")
                                .replace("\t", " ")
                            )
                            criterion_infos = (
                                str(criterion)
                                .replace("'", "''")
                                .replace(";", ",")
                                .replace("\t", " ")
                            )

                            # SQL clause given as a list is joined into one string
                            if criterion_sql is not None and isinstance(
                                criterion_sql, list
                            ):
                                criterion_sql = " ".join(criterion_sql)

                            # Fields default to the annotation name; normalize to a list
                            if criterion_fields is None:
                                criterion_fields = [annotation]
                            if not isinstance(criterion_fields, list):
                                criterion_fields = str(criterion_fields).split(",")

                            # Class normalized to a list
                            if criterion_class is not None and not isinstance(
                                criterion_class, list
                            ):
                                criterion_class = str(criterion_class).split(",")

                            for annotation_field in criterion_fields:

                                # Explode this annotation from INFO into its own column
                                log.debug(
                                    f"Explode annotation '{annotation_field}'"
                                )
                                added_columns += self.explode_infos(
                                    prefix=explode_infos_prefix,
                                    fields=[annotation_field],
                                    table=table_variants,
                                )
                                extra_infos = self.get_extra_infos(
                                    table=table_variants
                                )

                                # Fail fast if the annotation field is absent from the data
                                if (
                                    f"{explode_infos_prefix}{annotation_field}"
                                    not in extra_infos
                                ):
                                    msq_err = f"Annotation '{annotation_field}' not in data"
                                    log.error(msq_err)
                                    raise ValueError(msq_err)
                                else:
                                    log.debug(
                                        f"Annotation '{annotation_field}' in data"
                                    )

                                # SET assignments for this criterion
                                # NOTE(review): sql_set_info is reset here but no
                                # longer read after sql_set_info_option was built
                                # above — presumably dead; confirm before removing
                                sql_set = []
                                sql_set_info = []

                                # PZScore update: VaRank mode keeps the max
                                # criterion score, HOWARD mode (default) sums scores
                                if (
                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    if prioritization_score_mode == "VaRank":
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
                                        )
                                    else:
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
                                        )

                                # PZFlag update: AND-combined, so any FILTERED criterion filters the variant
                                if (
                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
                                    )

                                # PZClass update: append criterion classes, deduplicated
                                if (
                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                    and criterion_class is not None
                                ):
                                    sql_set.append(
                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
                                    )

                                # PZComment update: comma-appended to previous comments
                                if (
                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        {pz_prefix}Comment{pzfields_sep}{profile} =
                                        concat(
                                            {pz_prefix}Comment{pzfields_sep}{profile},
                                            CASE
                                                WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
                                                THEN ', '
                                                ELSE ''
                                            END,
                                            '{criterion_comment}'
                                        )
                                        """
                                    )

                                # PZInfos update: raw criterion repr appended
                                if (
                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        {pz_prefix}Infos{pzfields_sep}{profile} =
                                        concat(
                                            {pz_prefix}Infos{pzfields_sep}{profile},
                                            '{criterion_infos}'
                                        )
                                        """
                                    )
                                sql_set_option = ",".join(sql_set)

                                # Build the UPDATE with the criterion's WHERE clause
                                if sql_set_option:

                                    if criterion_mode in ["operation"]:

                                        # NOTE(review): bare except used as a
                                        # "is the value numeric" test — numeric
                                        # values compare as FLOAT, anything else
                                        # falls back to a SIMILAR TO text match
                                        try:
                                            float(criterion_value)
                                            sql_update = f"""
                                            UPDATE {table_variants}
                                            SET {sql_set_option}
                                            WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
                                            AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
                                            """
                                        except:
                                            contains_option = ""
                                            if criterion_type == "contains":
                                                contains_option = ".*"
                                            sql_update = f"""
                                            UPDATE {table_variants}
                                            SET {sql_set_option}
                                            WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
                                            """
                                        sql_queries.append(sql_update)

                                    elif criterion_mode in ["sql"]:

                                        sql_update = f"""
                                        UPDATE {table_variants}
                                        SET {sql_set_option}
                                        WHERE {criterion_sql}
                                        """
                                        sql_queries.append(sql_update)

                                    else:
                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
                                        log.error(msg_err)
                                        raise ValueError(msg_err)

                                else:
                                    log.warning(
                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
                                    )

                    # PZTags: compact summary field "field#value,field#value,..."
                    if (
                        f"{pz_prefix}Tags{pzfields_sep}{profile}"
                        in list_of_pzfields
                    ):

                        # Build the PZTags value fragment from the other PZ fields
                        pztags_value = ""
                        pztags_sep_default = ","
                        pztags_sep = ""
                        for pzfield in pzfields:
                            if pzfield not in [f"{pz_prefix}Tags"]:
                                if (
                                    f"{pzfield}{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    if pzfield in [f"{pz_prefix}Flag"]:
                                        pztags_value += f"""{pztags_sep}{pzfield}#',
                                        CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
                                            THEN 'PASS'
                                            ELSE 'FILTERED'
                                        END, '"""
                                    elif pzfield in [f"{pz_prefix}Class"]:
                                        pztags_value += f"""{pztags_sep}{pzfield}#',
                                        CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                            ELSE '.'
                                        END, '"""
                                    else:
                                        pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
                                    pztags_sep = pztags_sep_default

                        # Append per-profile PZTags to INFO
                        sql_update_pztags = f"""
                        UPDATE {table_variants}
                        SET INFO = concat(
                            INFO,
                            CASE WHEN INFO NOT in ('','.')
                                THEN ';'
                                ELSE ''
                            END,
                            '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
                        )
                        """
                        sql_queries.append(sql_update_pztags)

                        # Append profile-less PZTags for the default profile
                        if profile == default_profile:
                            sql_update_pztags_default = f"""
                            UPDATE {table_variants}
                            SET INFO = concat(
                                INFO,
                                ';',
                                '{pz_prefix}Tags={pztags_value}'
                            )
                            """
                            sql_queries.append(sql_update_pztags_default)

                    log.info(f"""Profile '{profile}' - Prioritization... """)

                    if sql_queries:

                        # Run all criterion UPDATEs for this profile
                        for sql_query in sql_queries:
                            log.debug(
                                f"""Profile '{profile}' - Prioritization query: {sql_query}... """
                            )
                            self.conn.execute(sql_query)

                        # Fold the computed PZ columns back into the INFO column
                        log.info(f"""Profile '{profile}' - Update... """)
                        sql_query_update = f"""
                        UPDATE {table_variants}
                        SET INFO =
                            concat(
                                CASE
                                    WHEN INFO NOT IN ('','.')
                                    THEN concat(INFO, ';')
                                    ELSE ''
                                END
                                {sql_set_info_option}
                            )
                        """
                        self.conn.execute(sql_query_update)

        else:

            log.warning(f"No profiles in parameters")

        # Drop the working columns added for prioritization
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Re-explode INFO fields into table columns if configured
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        return True
7815 """ 7816 return partition.apply(annotation_hgvs_partition, axis=1) 7817 7818 def annotation_hgvs_partition(row) -> str: 7819 """ 7820 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7821 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7822 7823 :param row: A dictionary-like object that contains the values for the following keys: 7824 :return: a string that contains the HGVS names associated with the given row of data. 7825 """ 7826 7827 chr = row["CHROM"] 7828 pos = row["POS"] 7829 ref = row["REF"] 7830 alt = row["ALT"] 7831 7832 # Find list of associated transcripts 7833 transcripts_list = list( 7834 polars_conn.execute( 7835 f""" 7836 SELECT transcript 7837 FROM refseq_df 7838 WHERE CHROM='{chr}' 7839 AND POS={pos} 7840 """ 7841 )["transcript"] 7842 ) 7843 7844 # Full HGVS annotation in list 7845 hgvs_full_list = [] 7846 7847 for transcript_name in transcripts_list: 7848 7849 # Transcript 7850 transcript = get_transcript( 7851 transcripts=transcripts, transcript_name=transcript_name 7852 ) 7853 # Exon 7854 if use_exon: 7855 exon = transcript.find_exon_number(pos) 7856 else: 7857 exon = None 7858 # Protein 7859 transcript_protein = None 7860 if use_protein or add_protein or full_format: 7861 transcripts_protein = list( 7862 polars_conn.execute( 7863 f""" 7864 SELECT protein 7865 FROM refseqlink_df 7866 WHERE transcript='{transcript_name}' 7867 LIMIT 1 7868 """ 7869 )["protein"] 7870 ) 7871 if len(transcripts_protein): 7872 transcript_protein = transcripts_protein[0] 7873 7874 # HGVS name 7875 hgvs_name = format_hgvs_name( 7876 chr, 7877 pos, 7878 ref, 7879 alt, 7880 genome=genome, 7881 transcript=transcript, 7882 transcript_protein=transcript_protein, 7883 exon=exon, 7884 use_gene=use_gene, 7885 use_protein=use_protein, 7886 full_format=full_format, 7887 use_version=use_version, 7888 codon_type=codon_type, 7889 ) 7890 hgvs_full_list.append(hgvs_name) 7891 if add_protein and not 
use_protein and not full_format: 7892 hgvs_name = format_hgvs_name( 7893 chr, 7894 pos, 7895 ref, 7896 alt, 7897 genome=genome, 7898 transcript=transcript, 7899 transcript_protein=transcript_protein, 7900 exon=exon, 7901 use_gene=use_gene, 7902 use_protein=True, 7903 full_format=False, 7904 use_version=use_version, 7905 codon_type=codon_type, 7906 ) 7907 hgvs_full_list.append(hgvs_name) 7908 7909 # Create liste of HGVS annotations 7910 hgvs_full = ",".join(hgvs_full_list) 7911 7912 return hgvs_full 7913 7914 # Polars connexion 7915 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7916 7917 # Config 7918 config = self.get_config() 7919 7920 # Databases 7921 # Genome 7922 databases_genomes_folders = ( 7923 config.get("folders", {}) 7924 .get("databases", {}) 7925 .get("genomes", DEFAULT_GENOME_FOLDER) 7926 ) 7927 databases_genome = ( 7928 config.get("folders", {}).get("databases", {}).get("genomes", "") 7929 ) 7930 # refseq database folder 7931 databases_refseq_folders = ( 7932 config.get("folders", {}) 7933 .get("databases", {}) 7934 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7935 ) 7936 # refseq 7937 databases_refseq = config.get("databases", {}).get("refSeq", None) 7938 # refSeqLink 7939 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7940 7941 # Param 7942 param = self.get_param() 7943 7944 # Quick HGVS 7945 if "hgvs_options" in param and param.get("hgvs_options", ""): 7946 log.info(f"Quick HGVS Annotation:") 7947 if not param.get("hgvs", None): 7948 param["hgvs"] = {} 7949 for option in param.get("hgvs_options", "").split(","): 7950 option_var_val = option.split("=") 7951 option_var = option_var_val[0] 7952 if len(option_var_val) > 1: 7953 option_val = option_var_val[1] 7954 else: 7955 option_val = "True" 7956 if option_val.upper() in ["TRUE"]: 7957 option_val = True 7958 elif option_val.upper() in ["FALSE"]: 7959 option_val = False 7960 log.info(f" {option_var}={option_val}") 7961 param["hgvs"][option_var] = option_val 7962 
7963 # Check if HGVS annotation enabled 7964 if "hgvs" in param: 7965 log.info(f"HGVS Annotation... ") 7966 for hgvs_option in param.get("hgvs", {}): 7967 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7968 else: 7969 return 7970 7971 # HGVS Param 7972 param_hgvs = param.get("hgvs", {}) 7973 use_exon = param_hgvs.get("use_exon", False) 7974 use_gene = param_hgvs.get("use_gene", False) 7975 use_protein = param_hgvs.get("use_protein", False) 7976 add_protein = param_hgvs.get("add_protein", False) 7977 full_format = param_hgvs.get("full_format", False) 7978 use_version = param_hgvs.get("use_version", False) 7979 codon_type = param_hgvs.get("codon_type", "3") 7980 7981 # refSseq refSeqLink 7982 databases_refseq = param_hgvs.get("refseq", databases_refseq) 7983 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 7984 7985 # Assembly 7986 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 7987 7988 # Genome 7989 genome_file = None 7990 if find_genome(databases_genome): 7991 genome_file = find_genome(databases_genome) 7992 else: 7993 genome_file = find_genome( 7994 genome_path=databases_genomes_folders, assembly=assembly 7995 ) 7996 log.debug("Genome: " + str(genome_file)) 7997 7998 # refSseq 7999 refseq_file = find_file_prefix( 8000 input_file=databases_refseq, 8001 prefix="ncbiRefSeq", 8002 folder=databases_refseq_folders, 8003 assembly=assembly, 8004 ) 8005 log.debug("refSeq: " + str(refseq_file)) 8006 8007 # refSeqLink 8008 refseqlink_file = find_file_prefix( 8009 input_file=databases_refseqlink, 8010 prefix="ncbiRefSeqLink", 8011 folder=databases_refseq_folders, 8012 assembly=assembly, 8013 ) 8014 log.debug("refSeqLink: " + str(refseqlink_file)) 8015 8016 # Threads 8017 if not threads: 8018 threads = self.get_threads() 8019 log.debug("Threads: " + str(threads)) 8020 8021 # Variables 8022 table_variants = self.get_table_variants(clause="update") 8023 8024 # Get variants SNV and InDel only 8025 
query_variants = f""" 8026 SELECT "#CHROM" AS CHROM, POS, REF, ALT 8027 FROM {table_variants} 8028 WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 8029 """ 8030 df_variants = self.get_query_to_df(query_variants) 8031 8032 # Added columns 8033 added_columns = [] 8034 8035 # Add hgvs column in variants table 8036 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 8037 added_column = self.add_column( 8038 table_variants, hgvs_column_name, "STRING", default_value=None 8039 ) 8040 added_columns.append(added_column) 8041 8042 log.debug(f"refSeq loading...") 8043 # refSeq in duckDB 8044 refseq_table = get_refseq_table( 8045 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 8046 ) 8047 # Loading all refSeq in Dataframe 8048 refseq_query = f""" 8049 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 8050 FROM {refseq_table} 8051 JOIN df_variants ON ( 8052 {refseq_table}.chrom = df_variants.CHROM 8053 AND {refseq_table}.txStart<=df_variants.POS 8054 AND {refseq_table}.txEnd>=df_variants.POS 8055 ) 8056 """ 8057 refseq_df = self.conn.query(refseq_query).pl() 8058 8059 if refseqlink_file: 8060 log.debug(f"refSeqLink loading...") 8061 # refSeqLink in duckDB 8062 refseqlink_table = get_refseq_table( 8063 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 8064 ) 8065 # Loading all refSeqLink in Dataframe 8066 protacc_column = "protAcc_with_ver" 8067 mrnaacc_column = "mrnaAcc_with_ver" 8068 refseqlink_query = f""" 8069 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 8070 FROM {refseqlink_table} 8071 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 8072 WHERE protAcc_without_ver IS NOT NULL 8073 """ 8074 # Polars Dataframe 8075 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 8076 8077 # Read RefSeq transcripts into a python dict/model. 
8078 log.debug(f"Transcripts loading...") 8079 with tempfile.TemporaryDirectory() as tmpdir: 8080 transcripts_query = f""" 8081 COPY ( 8082 SELECT {refseq_table}.* 8083 FROM {refseq_table} 8084 JOIN df_variants ON ( 8085 {refseq_table}.chrom=df_variants.CHROM 8086 AND {refseq_table}.txStart<=df_variants.POS 8087 AND {refseq_table}.txEnd>=df_variants.POS 8088 ) 8089 ) 8090 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 8091 """ 8092 self.conn.query(transcripts_query) 8093 with open(f"{tmpdir}/transcript.tsv") as infile: 8094 transcripts = read_transcripts(infile) 8095 8096 # Polars connexion 8097 polars_conn = pl.SQLContext(register_globals=True, eager=True) 8098 8099 log.debug("Genome loading...") 8100 # Read genome sequence using pyfaidx. 8101 genome = Fasta(genome_file) 8102 8103 log.debug("Start annotation HGVS...") 8104 8105 # Create 8106 # a Dask Dataframe from Pandas dataframe with partition as number of threads 8107 ddf = dd.from_pandas(df_variants, npartitions=threads) 8108 8109 # Use dask.dataframe.apply() to apply function on each partition 8110 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 8111 8112 # Convert Dask DataFrame to Pandas Dataframe 8113 df = ddf.compute() 8114 8115 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column
            # Join on the full variant key (#CHROM, POS, REF, ALT); only rows with a
            # non-empty computed HGVS value are written back
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
            """
            self.execute_query(update_variant_query)

            # Update INFO column
            # Append 'hgvs=<value>' to INFO, inserting a ';' separator only when INFO
            # already holds content (i.e. is not empty and not '.')
            sql_query_update = f"""
                UPDATE {table_variants}
                SET INFO =
                    concat(
                        CASE
                            WHEN INFO NOT IN ('','.')
                            THEN concat(INFO, ';')
                            ELSE ''
                        END,
                        'hgvs=',
                        {hgvs_column_name}
                    )
                WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
            self.execute_query(sql_query_update)

        # Add header
        # Declare the new 'hgvs' INFO field in the VCF header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns (working columns created during annotation)
        for added_column in added_columns:
            self.drop_column(column=added_column)

    ###
    # Calculation
    ###

    def get_operations_help(
        self, operations_config_dict: dict = {}, operations_config_file: str = None
    ) -> list:
        """
        Build a sorted, human-readable help list of the available calculation operations.

        :param operations_config_dict: calculation operations configuration as a dict
        :param operations_config_file: path to a calculation operations configuration file
        :return: list of help lines: a header line first, then one line per operation
            flagged as 'available', sorted alphabetically
        """

        # Init
        operations_help = []

        # operations
        operations = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )
        for op in operations:
            op_name = operations[op].get("name", op).upper()
            op_description = operations[op].get("description", op_name)
            op_available = operations[op].get("available", False)
            if op_available:
                operations_help.append(f" {op_name}: {op_description}")

        # Sort operations
        operations_help.sort()

        # insert header
        operations_help.insert(0, "Available calculation operations:")

        # Return
        return operations_help

    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        It takes a list of operations, and for each operation, it checks if it's a python or sql
        operation, and then calls the appropriate function

        param json example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    },
                    "middle" : null
                }
            }
        """

        # Param
        param = self.get_param()

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys
        # Operation names are matched case-insensitively: normalize config keys to upper
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add
        # 'calculations' (comma-separated string) is the quick/CLI form; it is merged
        # with any structured operations already present in param
        if param.get("calculations", None):

            # List of operations
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f" {calculation_key}")

            # Create tmp operations (to keep operation order)
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                    add_value_into_dict(
                        dict_tree=operations_tmp,
                        sections=[
                            calculation_operation.upper(),
                        ],
                        value=operations.get(calculation_operation.upper(), {}),
                    )
            # Add operations already in param
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Update operations in param
            operations = operations_tmp

        # Operations for calculation
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

            # For each operations
            for operation_name in operations:
                operation_name = operation_name.upper()
                if operation_name not in [""]:
                    if operation_name in operations_config:
                        log.info(f"Calculation '{operation_name}'")
                        operation = operations_config[operation_name]
                        operation_type = operation.get("type", "sql")
                        if operation_type == "python":
                            self.calculation_process_function(
                                operation=operation, operation_name=operation_name
                            )
                        elif operation_type == "sql":
                            self.calculation_process_sql(
                                operation=operation, operation_name=operation_name
                            )
                        else:
                            log.error(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                            raise ValueError(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                    else:
                        log.error(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )

        # Explode INFOS fields into table fields
        if
self.get_explode_infos(): 8326 self.explode_infos( 8327 prefix=self.get_explode_infos_prefix(), 8328 fields=self.get_explode_infos_fields(), 8329 force=True, 8330 ) 8331 8332 def calculation_process_sql( 8333 self, operation: dict, operation_name: str = "unknown" 8334 ) -> None: 8335 """ 8336 The `calculation_process_sql` function takes in a mathematical operation as a string and 8337 performs the operation, updating the specified table with the result. 8338 8339 :param operation: The `operation` parameter is a dictionary that contains information about the 8340 mathematical operation to be performed. It includes the following keys: 8341 :type operation: dict 8342 :param operation_name: The `operation_name` parameter is a string that represents the name of 8343 the mathematical operation being performed. It is used for logging and error handling purposes, 8344 defaults to unknown 8345 :type operation_name: str (optional) 8346 """ 8347 8348 # Operation infos 8349 operation_name = operation.get("name", "unknown") 8350 log.debug(f"process sql {operation_name}") 8351 output_column_name = operation.get("output_column_name", operation_name) 8352 output_column_type = operation.get("output_column_type", "String") 8353 prefix = operation.get("explode_infos_prefix", "") 8354 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 8355 output_column_description = operation.get( 8356 "output_column_description", f"{operation_name} operation" 8357 ) 8358 operation_query = operation.get("operation_query", None) 8359 if isinstance(operation_query, list): 8360 operation_query = " ".join(operation_query) 8361 operation_info_fields = operation.get("info_fields", []) 8362 operation_info_fields_check = operation.get("info_fields_check", False) 8363 operation_info = operation.get("operation_info", True) 8364 operation_table = operation.get( 8365 "table", self.get_table_variants(clause="alter") 8366 ) 8367 8368 # table variants 8369 if operation_table: 8370 
table_variants = operation_table 8371 else: 8372 table_variants = self.get_table_variants(clause="alter") 8373 8374 if operation_query: 8375 8376 # Info fields check 8377 operation_info_fields_check_result = True 8378 if operation_info_fields_check: 8379 header_infos = self.get_header().infos 8380 for info_field in operation_info_fields: 8381 operation_info_fields_check_result = ( 8382 operation_info_fields_check_result 8383 and info_field in header_infos 8384 ) 8385 8386 # If info fields available 8387 if operation_info_fields_check_result: 8388 8389 # Added_columns 8390 added_columns = [] 8391 8392 # Create VCF header field 8393 vcf_reader = self.get_header() 8394 vcf_reader.infos[output_column_name] = vcf.parser._Info( 8395 output_column_name, 8396 ".", 8397 output_column_type, 8398 output_column_description, 8399 "howard calculation", 8400 "0", 8401 self.code_type_map.get(output_column_type), 8402 ) 8403 8404 # Explode infos if needed 8405 log.debug(f"calculation_process_sql prefix {prefix}") 8406 added_columns += self.explode_infos( 8407 prefix=prefix, 8408 fields=[output_column_name] + operation_info_fields, 8409 force=False, 8410 table=table_variants, 8411 ) 8412 8413 # Create column 8414 added_column = self.add_column( 8415 table_name=table_variants, 8416 column_name=prefix + output_column_name, 8417 column_type=output_column_type_sql, 8418 default_value="null", 8419 ) 8420 added_columns.append(added_column) 8421 8422 # Operation calculation 8423 try: 8424 8425 # Query to update calculation column 8426 sql_update = f""" 8427 UPDATE {table_variants} 8428 SET "{prefix}{output_column_name}" = ({operation_query}) 8429 """ 8430 self.conn.execute(sql_update) 8431 8432 # Add to INFO 8433 if operation_info: 8434 sql_update_info = f""" 8435 UPDATE {table_variants} 8436 SET "INFO" = 8437 concat( 8438 CASE 8439 WHEN "INFO" IS NOT NULL 8440 THEN concat("INFO", ';') 8441 ELSE '' 8442 END, 8443 '{output_column_name}=', 8444 "{prefix}{output_column_name}" 8445 ) 8446 
WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8447 """ 8448 self.conn.execute(sql_update_info) 8449 8450 except: 8451 log.error( 8452 f"Operations config: Calculation '{operation_name}' query failed" 8453 ) 8454 raise ValueError( 8455 f"Operations config: Calculation '{operation_name}' query failed" 8456 ) 8457 8458 # Remove added columns 8459 for added_column in added_columns: 8460 log.debug(f"added_column: {added_column}") 8461 self.drop_column(column=added_column) 8462 8463 else: 8464 log.error( 8465 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8466 ) 8467 raise ValueError( 8468 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8469 ) 8470 8471 else: 8472 log.error( 8473 f"Operations config: Calculation '{operation_name}' query NOT defined" 8474 ) 8475 raise ValueError( 8476 f"Operations config: Calculation '{operation_name}' query NOT defined" 8477 ) 8478 8479 def calculation_process_function( 8480 self, operation: dict, operation_name: str = "unknown" 8481 ) -> None: 8482 """ 8483 The `calculation_process_function` takes in an operation dictionary and performs the specified 8484 function with the given parameters. 8485 8486 :param operation: The `operation` parameter is a dictionary that contains information about the 8487 operation to be performed. It has the following keys: 8488 :type operation: dict 8489 :param operation_name: The `operation_name` parameter is a string that represents the name of 8490 the operation being performed. 
It is used for logging purposes, defaults to unknown
        :type operation_name: str (optional)
        """

        # The parameter is overridden by the operation's mandatory 'name' key
        operation_name = operation["name"]
        # NOTE(review): message says 'sql' but this is the python-function path
        log.debug(f"process sql {operation_name}")
        function_name = operation["function_name"]
        function_params = operation["function_params"]
        # Dispatch: call the method of this object named by the operation config
        getattr(self, function_name)(*function_params)

    def calculation_variant_id(self) -> None:
        """
        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
        updates the INFO field of a variants table with the variant ID.
        """

        # variant_id annotation field
        variant_id_tag = self.get_variant_id_column()
        added_columns = [variant_id_tag]

        # variant_id tags
        vcf_infos_tags = {
            variant_id_tag: "howard variant ID annotation",
        }

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add variant_id to header
        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
            variant_id_tag,
            ".",
            "String",
            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Update
        # Append '<tag>=<value>' to INFO, adding a ';' separator only when INFO
        # already holds content (not NULL, '' or '.')
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    '{variant_id_tag}=',
                    "{variant_id_tag}"
                )
        """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new column in the variants table.

        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
        function is used to specify the name of the column that will store the HGVS nomenclatures
        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
        snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
        function represents the field in the VCF file that contains SnpEff annotations. This field is
        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
        to ANN
        :type snpeff_field: str (optional)
        """

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        # NOTE(review): any non-empty configured prefix is replaced by "INFO/" here —
        # looks intentional (exploded-INFO column naming) but worth confirming
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header
            # The snpEff ANN field names are embedded in the header Description,
            # quoted and separated by " | "
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters to build a safe identifier
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create main NOMEN column
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update
            # Join back to the variants table on the variant id; duckdb resolves the
            # pandas dataframe 'dataframe_snpeff_hgvs' by name from local scope.
            # NOTE(review): target table is hardcoded 'variants' while the WHERE uses
            # {table_variants} — confirm both always refer to the same table
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{snpeff_hgvs}=',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            # 'Anotate' (sic) kept as-is: runtime message text
            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
        exploding the HGVS field and updating variant information accordingly.

        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
        it indicates that the output should be unique, meaning that duplicate entries should be removed,
        defaults to True
        :type uniquify: bool (optional)
        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
        function specifies the format in which the output annotations will be generated. It has a
        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
        format, defaults to fields
        :type output_format: str (optional)
        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
        method is used to specify the prefix that will be added to the output annotations generated
        during the calculation process. This prefix helps to differentiate the newly added annotations
        from existing ones in the output data. By default, the, defaults to ANN_
        :type output_prefix: str (optional)
        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
        function is used to specify the field in the VCF file that contains SnpEff annotations.
This 8717 field will be processed to explode the HGVS annotations and update the variant information 8718 accordingly, defaults to ANN 8719 :type snpeff_field: str (optional) 8720 """ 8721 8722 # SnpEff annotation field 8723 snpeff_hgvs = "snpeff_ann_explode" 8724 8725 # Snpeff hgvs tags 8726 vcf_infos_tags = { 8727 snpeff_hgvs: "Explode snpEff annotations", 8728 } 8729 8730 # Prefix 8731 prefix = self.get_explode_infos_prefix() 8732 if prefix: 8733 prefix = "INFO/" 8734 8735 # snpEff fields 8736 speff_ann_infos = prefix + snpeff_field 8737 speff_hgvs_infos = prefix + snpeff_hgvs 8738 8739 # Variants table 8740 table_variants = self.get_table_variants() 8741 8742 # Header 8743 vcf_reader = self.get_header() 8744 8745 # Add columns 8746 added_columns = [] 8747 8748 # Explode HGVS field in column 8749 added_columns += self.explode_infos(fields=[snpeff_field]) 8750 log.debug(f"snpeff_field={snpeff_field}") 8751 log.debug(f"added_columns={added_columns}") 8752 8753 if snpeff_field in vcf_reader.infos: 8754 8755 # Extract ANN header 8756 ann_description = vcf_reader.infos[snpeff_field].desc 8757 pattern = r"'(.+?)'" 8758 match = re.search(pattern, ann_description) 8759 if match: 8760 ann_header_match = match.group(1).split(" | ") 8761 ann_header = [] 8762 ann_header_desc = {} 8763 for i in range(len(ann_header_match)): 8764 ann_header_info = "".join( 8765 char for char in ann_header_match[i] if char.isalnum() 8766 ) 8767 ann_header.append(ann_header_info) 8768 ann_header_desc[ann_header_info] = ann_header_match[i] 8769 if not ann_header_desc: 8770 raise ValueError("Invalid header description format") 8771 else: 8772 raise ValueError("Invalid header description format") 8773 8774 # Create variant id 8775 variant_id_column = self.get_variant_id_column() 8776 added_columns += [variant_id_column] 8777 8778 # Create dataframe 8779 dataframe_snpeff_hgvs = self.get_query_to_df( 8780 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8781 ) 8782 
8783 # Create snpEff columns 8784 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8785 speff_ann_infos 8786 ].apply( 8787 lambda x: explode_snpeff_ann( 8788 str(x), 8789 uniquify=uniquify, 8790 output_format=output_format, 8791 prefix=output_prefix, 8792 header=list(ann_header_desc.values()), 8793 ) 8794 ) 8795 8796 # Header 8797 ann_annotations_prefix = "" 8798 if output_format.upper() in ["JSON"]: 8799 ann_annotations_prefix = f"{output_prefix}=" 8800 vcf_reader.infos[output_prefix] = vcf.parser._Info( 8801 output_prefix, 8802 ".", 8803 "String", 8804 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8805 + " - JSON format", 8806 "howard calculation", 8807 "0", 8808 self.code_type_map.get("String"), 8809 ) 8810 else: 8811 for ann_annotation in ann_header: 8812 ann_annotation_id = f"{output_prefix}{ann_annotation}" 8813 vcf_reader.infos[ann_annotation_id] = vcf.parser._Info( 8814 ann_annotation_id, 8815 ".", 8816 "String", 8817 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8818 + f" - '{ann_header_desc[ann_annotation]}' annotation", 8819 "howard calculation", 8820 "0", 8821 self.code_type_map.get("String"), 8822 ) 8823 8824 # Update 8825 sql_update = f""" 8826 UPDATE variants 8827 SET "INFO" = 8828 concat( 8829 CASE 8830 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8831 THEN '' 8832 ELSE concat("INFO", ';') 8833 END, 8834 CASE 8835 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8836 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8837 THEN concat( 8838 '{ann_annotations_prefix}', 8839 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8840 ) 8841 ELSE '' 8842 END 8843 ) 8844 FROM dataframe_snpeff_hgvs 8845 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8846 8847 """ 8848 self.conn.execute(sql_update) 8849 8850 # Delete dataframe 8851 del dataframe_snpeff_hgvs 8852 gc.collect() 8853 8854 else: 8855 8856 log.warning( 8857 "No snpEff annotation. 
Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
        """

        # NOMEN field
        # Name of the temporary dataframe column holding the full NOMEN dict per variant
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure
        # One INFO field per NOMEN component; values are the header descriptions
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Added columns
        added_columns = []

        # Get HGVS field
        # All options come from param["calculation"]["calculations"]["NOMEN"]["options"]
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get NOMEN pattern
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # transcripts list of preference sources
        transcripts_sources = {}

        # Get transcripts
        # Optional file listing preferred transcripts (first column used)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Get transcripts table
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Get transcripts column
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        if transcripts_table and transcripts_column:
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode if not exists
            # NOTE(review): return value (added columns) is not tracked for cleanup here
            self.explode_infos(fields=[transcripts_column], table=transcripts_table)
        else:
            extra_field_transcript = f"NULL"

        # Transcripts of preference source order
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Transcripts from file
        transcripts = transcripts_sources.get("file", [])

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe
            # NOTE(review): table name 'variants' is hardcoded in this SELECT and in
            # the UPDATE below — confirm it always matches the variants table in use
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Create main NOMEN column
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                ),
                axis=1,
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Each fragment appends ';FIELD=value' only when the value is set;
                # leading ';' is correct because INFO is never left empty beforehand
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update
            # Join back on the full variant key (#CHROM, POS, REF, ALT)
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                AND variants."POS" = dataframe_hgvs."POS"
                AND variants."REF" = dataframe_hgvs."REF"
                AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` performs a calculation to find the number of
        pipeline/sample for a variant and updates the variant information in a VCF file.

        :param tag: The `tag` parameter is a string that represents the annotation field for the
        "findbypipeline" information in the VCF file.
It is used to create the annotation field in the 9070 VCF header and to update the corresponding field in the variants table, defaults to 9071 findbypipeline 9072 :type tag: str (optional) 9073 """ 9074 9075 # if FORMAT and samples 9076 if ( 9077 "FORMAT" in self.get_header_columns_as_list() 9078 and self.get_header_sample_list() 9079 ): 9080 9081 # findbypipeline annotation field 9082 findbypipeline_tag = tag 9083 9084 # VCF infos tags 9085 vcf_infos_tags = { 9086 findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})", 9087 } 9088 9089 # Prefix 9090 prefix = self.get_explode_infos_prefix() 9091 9092 # Field 9093 findbypipeline_infos = prefix + findbypipeline_tag 9094 9095 # Variants table 9096 table_variants = self.get_table_variants() 9097 9098 # Header 9099 vcf_reader = self.get_header() 9100 9101 # Create variant id 9102 variant_id_column = self.get_variant_id_column() 9103 added_columns = [variant_id_column] 9104 9105 # variant_id, FORMAT and samples 9106 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9107 self.get_header_sample_list() 9108 ) 9109 9110 # Create dataframe 9111 dataframe_findbypipeline = self.get_query_to_df( 9112 f""" SELECT {samples_fields} FROM {table_variants} """ 9113 ) 9114 9115 # Create findbypipeline column 9116 dataframe_findbypipeline[findbypipeline_infos] = ( 9117 dataframe_findbypipeline.apply( 9118 lambda row: findbypipeline( 9119 row, samples=self.get_header_sample_list() 9120 ), 9121 axis=1, 9122 ) 9123 ) 9124 9125 # Add snpeff_hgvs to header 9126 vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info( 9127 findbypipeline_tag, 9128 ".", 9129 "String", 9130 vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"), 9131 "howard calculation", 9132 "0", 9133 self.code_type_map.get("String"), 9134 ) 9135 9136 # Update 9137 sql_update = f""" 9138 UPDATE variants 9139 SET "INFO" = 9140 concat( 9141 CASE 9142 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9143 THEN '' 9144 ELSE 
concat("INFO", ';') 9145 END, 9146 CASE 9147 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 9148 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 9149 THEN concat( 9150 '{findbypipeline_tag}=', 9151 dataframe_findbypipeline."{findbypipeline_infos}" 9152 ) 9153 ELSE '' 9154 END 9155 ) 9156 FROM dataframe_findbypipeline 9157 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 9158 """ 9159 self.conn.execute(sql_update) 9160 9161 # Remove added columns 9162 for added_column in added_columns: 9163 self.drop_column(column=added_column) 9164 9165 # Delete dataframe 9166 del dataframe_findbypipeline 9167 gc.collect() 9168 9169 def calculation_genotype_concordance(self) -> None: 9170 """ 9171 The function `calculation_genotype_concordance` calculates the genotype concordance for 9172 multi-caller VCF files and updates the variant information in the database. 9173 """ 9174 9175 # if FORMAT and samples 9176 if ( 9177 "FORMAT" in self.get_header_columns_as_list() 9178 and self.get_header_sample_list() 9179 ): 9180 9181 # genotypeconcordance annotation field 9182 genotypeconcordance_tag = "genotypeconcordance" 9183 9184 # VCF infos tags 9185 vcf_infos_tags = { 9186 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 9187 } 9188 9189 # Prefix 9190 prefix = self.get_explode_infos_prefix() 9191 9192 # Field 9193 genotypeconcordance_infos = prefix + genotypeconcordance_tag 9194 9195 # Variants table 9196 table_variants = self.get_table_variants() 9197 9198 # Header 9199 vcf_reader = self.get_header() 9200 9201 # Create variant id 9202 variant_id_column = self.get_variant_id_column() 9203 added_columns = [variant_id_column] 9204 9205 # variant_id, FORMAT and samples 9206 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9207 self.get_header_sample_list() 9208 ) 9209 9210 # Create dataframe 9211 dataframe_genotypeconcordance = self.get_query_to_df( 9212 f""" SELECT 
{samples_fields} FROM {table_variants} """ 9213 ) 9214 9215 # Create genotypeconcordance column 9216 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 9217 dataframe_genotypeconcordance.apply( 9218 lambda row: genotypeconcordance( 9219 row, samples=self.get_header_sample_list() 9220 ), 9221 axis=1, 9222 ) 9223 ) 9224 9225 # Add genotypeconcordance to header 9226 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 9227 genotypeconcordance_tag, 9228 ".", 9229 "String", 9230 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 9231 "howard calculation", 9232 "0", 9233 self.code_type_map.get("String"), 9234 ) 9235 9236 # Update 9237 sql_update = f""" 9238 UPDATE variants 9239 SET "INFO" = 9240 concat( 9241 CASE 9242 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9243 THEN '' 9244 ELSE concat("INFO", ';') 9245 END, 9246 CASE 9247 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 9248 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 9249 THEN concat( 9250 '{genotypeconcordance_tag}=', 9251 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 9252 ) 9253 ELSE '' 9254 END 9255 ) 9256 FROM dataframe_genotypeconcordance 9257 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 9258 """ 9259 self.conn.execute(sql_update) 9260 9261 # Remove added columns 9262 for added_column in added_columns: 9263 self.drop_column(column=added_column) 9264 9265 # Delete dataframe 9266 del dataframe_genotypeconcordance 9267 gc.collect() 9268 9269 def calculation_barcode(self, tag: str = "barcode") -> None: 9270 """ 9271 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 9272 updates the INFO field in the file with the calculated barcode values. 9273 9274 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 9275 name that will be used for the barcode calculation in the VCF file. 
If no tag name is provided, 9276 the default tag name is set to "barcode", defaults to barcode 9277 :type tag: str (optional) 9278 """ 9279 9280 # if FORMAT and samples 9281 if ( 9282 "FORMAT" in self.get_header_columns_as_list() 9283 and self.get_header_sample_list() 9284 ): 9285 9286 # barcode annotation field 9287 if not tag: 9288 tag = "barcode" 9289 9290 # VCF infos tags 9291 vcf_infos_tags = { 9292 tag: "barcode calculation (VaRank)", 9293 } 9294 9295 # Prefix 9296 prefix = self.get_explode_infos_prefix() 9297 9298 # Field 9299 barcode_infos = prefix + tag 9300 9301 # Variants table 9302 table_variants = self.get_table_variants() 9303 9304 # Header 9305 vcf_reader = self.get_header() 9306 9307 # Create variant id 9308 variant_id_column = self.get_variant_id_column() 9309 added_columns = [variant_id_column] 9310 9311 # variant_id, FORMAT and samples 9312 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9313 self.get_header_sample_list() 9314 ) 9315 9316 # Create dataframe 9317 dataframe_barcode = self.get_query_to_df( 9318 f""" SELECT {samples_fields} FROM {table_variants} """ 9319 ) 9320 9321 # Create barcode column 9322 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9323 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 9324 ) 9325 9326 # Add barcode to header 9327 vcf_reader.infos[tag] = vcf.parser._Info( 9328 tag, 9329 ".", 9330 "String", 9331 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 9332 "howard calculation", 9333 "0", 9334 self.code_type_map.get("String"), 9335 ) 9336 9337 # Update 9338 sql_update = f""" 9339 UPDATE {table_variants} 9340 SET "INFO" = 9341 concat( 9342 CASE 9343 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9344 THEN '' 9345 ELSE concat("INFO", ';') 9346 END, 9347 CASE 9348 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 9349 AND dataframe_barcode."{barcode_infos}" NOT NULL 9350 THEN concat( 9351 '{tag}=', 9352 dataframe_barcode."{barcode_infos}" 9353 ) 9354 ELSE '' 9355 
END 9356 ) 9357 FROM dataframe_barcode 9358 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9359 """ 9360 self.conn.execute(sql_update) 9361 9362 # Remove added columns 9363 for added_column in added_columns: 9364 self.drop_column(column=added_column) 9365 9366 # Delete dataframe 9367 del dataframe_barcode 9368 gc.collect() 9369 9370 def calculation_barcode_family(self, tag: str = "BCF") -> None: 9371 """ 9372 The `calculation_barcode_family` function calculates barcode values for variants in a VCF file 9373 and updates the INFO field in the file with the calculated barcode values. 9374 9375 :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify 9376 the barcode tag that will be added to the VCF file during the calculation process. If no value 9377 is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF 9378 :type tag: str (optional) 9379 """ 9380 9381 # if FORMAT and samples 9382 if ( 9383 "FORMAT" in self.get_header_columns_as_list() 9384 and self.get_header_sample_list() 9385 ): 9386 9387 # barcode annotation field 9388 if not tag: 9389 tag = "BCF" 9390 9391 # VCF infos tags 9392 vcf_infos_tags = { 9393 tag: "barcode family calculation", 9394 f"{tag}S": "barcode family samples", 9395 } 9396 9397 # Param 9398 param = self.get_param() 9399 log.debug(f"param={param}") 9400 9401 # Prefix 9402 prefix = self.get_explode_infos_prefix() 9403 9404 # PED param 9405 ped = ( 9406 param.get("calculation", {}) 9407 .get("calculations", {}) 9408 .get("BARCODEFAMILY", {}) 9409 .get("family_pedigree", None) 9410 ) 9411 log.debug(f"ped={ped}") 9412 9413 # Load PED 9414 if ped: 9415 9416 # Pedigree is a file 9417 if isinstance(ped, str) and os.path.exists(full_path(ped)): 9418 log.debug("Pedigree is file") 9419 with open(full_path(ped)) as ped: 9420 ped = json.load(ped) 9421 9422 # Pedigree is a string 9423 elif isinstance(ped, str): 9424 log.debug("Pedigree is str") 9425 
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    # Not JSON: treat as comma-separated sample names
                    # NOTE(review): bound exception `e` is unused.
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is already a dict: use as-is
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct sample list from the pedigree values
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: use every sample from the header
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree is non-empty
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Name of the working column holding the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and the pedigree samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe of genotype columns
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the barcode value over the pedigree samples only
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family FORMAT fields to header: the barcode itself
            # (tag) and the participating sample list (tagS)
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per genotype column (samples + FORMAT):
            # - pedigree samples get the computed barcode + the sample list
            # - FORMAT gets the new field names
            # - other samples get missing values '.'
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # For fully-missing genotypes ('./.'), pad with one '.' per
                # FORMAT sub-field before appending the two new values
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses in one UPDATE, joined on variant id
            # (DuckDB replacement scan on the local DataFrame)
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_barcode
            gc.collect()

    def calculation_trio(self) -> None:
        """
        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
        information to the INFO field of each variant.
        """

        # Only applicable when the VCF has a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # trio annotation field
            trio_tag = "trio"

            # VCF infos tags (header descriptions)
            vcf_infos_tags = {
                "trio": "trio calculation",
            }

            # Param
            param = self.get_param()

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Trio pedigree from calculation options (file path, JSON string,
            # comma-separated "father,mother,child", or dict)
            trio_ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("TRIO", {})
                .get("trio_pedigree", None)
            )

            # Load trio
            if trio_ped:

                # Trio pedigree is a file path: load JSON content
                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
                    log.debug("TRIO pedigree is file")
                    with open(full_path(trio_ped)) as trio_ped:
                        trio_ped = json.load(trio_ped)

                # Trio pedigree is a string: JSON first, then comma-separated fallback
                elif isinstance(trio_ped, str):
                    log.debug("TRIO pedigree is str")
                    try:
                        trio_ped = json.loads(trio_ped)
                        log.debug("TRIO pedigree is json str")
                    # NOTE(review): bound exception `e` is unused.
                    except ValueError as e:
                        trio_samples = trio_ped.split(",")
                        if len(trio_samples) == 3:
                            trio_ped = {
                                "father": trio_samples[0],
                                "mother": trio_samples[1],
                                "child": trio_samples[2],
                            }
                            log.debug("TRIO pedigree is list str")
                        else:
                            msg_error = "TRIO pedigree not well formatted"
                            log.error(msg_error)
                            raise ValueError(msg_error)

                # Trio pedigree is already a dict: use as-is
                elif isinstance(trio_ped, dict):
                    log.debug("TRIO pedigree is dict")

                # Trio pedigree is not well formatted
                else:
                    msg_error = "TRIO pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct trio sample list in father/mother/child order
                trio_samples = [
                    trio_ped.get("father", ""),
                    trio_ped.get("mother", ""),
                    trio_ped.get("child", ""),
                ]

            else:
                # No pedigree configured: default to the first 3 samples
                log.debug("TRIO pedigree not defined. Take the first 3 samples")
                samples_list = self.get_header_sample_list()
                if len(samples_list) >= 3:
                    trio_samples = self.get_header_sample_list()[0:3]
                    trio_ped = {
                        "father": trio_samples[0],
                        "mother": trio_samples[1],
                        "child": trio_samples[2],
                    }
                else:
                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
                    log.error(msg_error)
                    raise ValueError(msg_error)

            # Check trio pedigree has exactly 3 members
            if not trio_ped or len(trio_ped) != 3:
                msg_error = f"Error in TRIO pedigree: {trio_ped}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                f"Calculation 'TRIO' - Samples: "
                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
            )

            # Name of the working column holding the computed value
            trio_infos = prefix + trio_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and all samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe of genotype columns
            dataframe_trio = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the trio value row by row (helper from commons)
            dataframe_trio[trio_infos] = dataframe_trio.apply(
                lambda row: trio(row, samples=trio_samples), axis=1
            )

            # Add the trio tag to the VCF header INFO fields.
            # NOTE(review): the fallback description "snpEff hgvs annotations"
            # looks copy-pasted; it is never used since trio_tag is in vcf_infos_tags.
            vcf_reader.infos[trio_tag] = vcf.parser._Info(
                trio_tag,
                ".",
                "String",
                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<tag>=<value>' to INFO when the computed value is set
            # (DuckDB replacement scan on the local DataFrame)
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE
                                concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
                            AND dataframe_trio."{trio_infos}" NOT NULL
                            THEN concat(
                                '{trio_tag}=',
                                dataframe_trio."{trio_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_trio
                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_trio
            gc.collect()

    def calculation_vaf_normalization(self) -> None:
        """
        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
        :return: The function does not return anything.
        """

        # Only applicable when the VCF has a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_normalization annotation field
            vaf_normalization_tag = "VAF"

            # VCF infos tags (header descriptions)
            vcf_infos_tags = {
                "VAF": "VAF Variant Frequency",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Do not calculate if VAF already exists in FORMAT fields
            if "VAF" in vcf_reader.formats:
                log.debug("VAF already on genotypes")
                return

            # Create variant id column (dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and all quoted sample names
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                f""" "{sample}" """ for sample in self.get_header_sample_list()
            )

            # Create dataframe of genotype columns.
            # NOTE(review): variant_id and FORMAT are selected twice (they are
            # already part of samples_fields) — confirm this is intentional.
            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
            log.debug(f"query={query}")
            dataframe_vaf_normalization = self.get_query_to_df(query=query)

            vaf_normalization_set = []

            # For each sample, rewrite the genotype with the normalized VAF
            # appended, and register the matching SQL SET clause
            for sample in self.get_header_sample_list():
                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
                    lambda row: vaf_normalization(row, sample=sample), axis=1
                )
                vaf_normalization_set.append(
                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
                )

            # Add VAF to the FORMAT column for every variant
            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
                "FORMAT"
            ].apply(lambda x: str(x) + ":VAF")
            vaf_normalization_set.append(
                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
            )

            # Add the VAF FORMAT field to the header
            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
                id=vaf_normalization_tag,
                num="1",
                type="Float",
                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
                type_code=self.code_type_map.get("Float"),
            )

            # Combine all SET clauses
            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)

            # Update genotype columns joined on variant id
            # (DuckDB replacement scan on the local DataFrame).
            # NOTE(review): WHERE uses hard-coded `variants.` instead of
            # {table_variants} — confirm the two names always match.
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_vaf_normalization_set}
                FROM dataframe_vaf_normalization
                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_vaf_normalization
            gc.collect()

    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Only applicable when the VCF has a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Stats annotation field (e.g. "VAF_stats")
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one INFO field per computed statistic
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Name of the working column holding the computed stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and all samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe of genotype columns
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute all statistics row by row (helper from commons; returns
            # a dict keyed by the stat tag names)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL CASE fragments, one per statistic
            sql_vaf_stats_fields = []

            # For each statistic: extract it into its own column, register the
            # header INFO field, and build its CASE fragment
            for stat in vcf_infos_tags:

                # Extract the single statistic from the stats dict
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add the statistic tag to the VCF header INFO fields
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Separator: none before the first fragment, ';' afterwards
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Append '<sep><stat>=<value>' to INFO when the value is set
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update INFO joined on variant id
            # (DuckDB replacement scan on the local DataFrame)
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_vaf_stats
            gc.collect()

    def calculation_transcripts_annotation(
        self, info_json: str = None, info_format: str = None
    ) -> None:
        """
        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
        field to it if transcripts are available.

        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
        is a string parameter that represents the information field to be used in the transcripts JSON.
        It is used to specify the JSON format for the transcripts information. If no value is provided
        when calling the method, it defaults to None
        :type info_json: str
        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
        method is a string parameter that specifies the format of the information field to be used in
        the transcripts JSON. It is used to define the format of the information field
        :type info_format: str
        """

        # Create transcripts table (view over transcript annotations)
        transcripts_table = self.create_transcript_view()

        # Add info field to the variants table from the transcripts view
        if transcripts_table:
            self.transcript_view_to_variants(
                transcripts_table=transcripts_table,
                transcripts_info_field_json=info_json,
                transcripts_info_field_format=info_format,
            )
        else:
            log.info("No Transcripts to process. Check param.json file configuration")

    def calculation_transcripts_prioritization(self) -> None:
        """
        The function `calculation_transcripts_prioritization` creates a transcripts table and
        prioritizes transcripts based on certain criteria.
        """

        # Create transcripts table (view over transcript annotations)
        transcripts_table = self.create_transcript_view()

        # Run prioritization on the transcripts table
        if transcripts_table:
            self.transcripts_prioritization(transcripts_table=transcripts_table)
        else:
            log.info("No Transcripts to process. Check param.json file configuration")

    def calculation_transcripts_export(self) -> None:
        """
        Create the transcripts table and export it to the configured output
        file (see `transcripts_export`).
        """

        # Create transcripts table (view over transcript annotations)
        transcripts_table = self.create_transcript_view()

        # Export the transcripts table
        if transcripts_table:
            self.transcripts_export(transcripts_table=transcripts_table)
        else:
            log.info("No Transcripts to process. Check param.json file configuration")

    ###############
    # Transcripts #
    ###############

    def transcripts_export(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        Export the transcripts table to the output file configured under
        param["transcripts"]["export"]["output"]. For a VCF output, transcript
        annotations are folded into the INFO column; otherwise annotations are
        exported as plain columns.

        NOTE(review): declared `-> bool` but only `return False` paths exist;
        success falls off the end and returns None. Also: mutable default
        `param: dict = {}` is shared across calls — confirm callers never mutate it.

        :param transcripts_table: name of the transcripts table to export
        :param param: parameters dict; defaults to self.get_param() when empty
        :return: False when export parameters are missing (None on success)
        """

        log.debug("Start transcripts export...")

        # Param
        if not param:
            param = self.get_param()

        # Export parameters
        param_transcript_export = param.get("transcripts", {}).get("export", {})

        # Output file
        transcripts_export_output = param_transcript_export.get("output", None)

        # NOTE(review): typo "transcriipts" in the warning message (runtime string)
        if not param_transcript_export or not transcripts_export_output:
            log.warning(f"No transcriipts export parameters defined!")
            return False

        # List of transcripts annotations (all columns except the variant key and INFO)
        query_describe = f"""
            SELECT column_name
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
            )
            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
        """
        transcripts_annotations_list = list(
            self.get_query_to_df(query=query_describe)["column_name"]
        )

        # Create a temporary export table with a random suffix to avoid collisions
        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
            random.choices(string.ascii_uppercase + string.digits, k=10)
        )
        query_create_transcripts_table_export = f"""
            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
        """
        self.execute_query(query=query_create_transcripts_table_export)

        # Output file format (detected from the file name)
        transcripts_export_output_format = get_file_format(
            filename=transcripts_export_output
        )

        # Format VCF - fold annotations into the INFO column
        if transcripts_export_output_format in ["vcf"]:

            # Construct query update INFO and header
            query_update_info = []
            for field in transcripts_annotations_list:

                # Declare the field in the header if not already present
                if field not in self.get_header_infos_list():

                    # Add the annotation field to the header INFO fields
                    self.get_header().infos[field] = vcf.parser._Info(
                        field,
                        ".",
                        "String",
                        f"Annotation '{field}' from transcript view",
                        "unknown",
                        "unknown",
                        0,
                    )

                # Append '<field>=<value>;' to INFO when the value is set
                query_update_info.append(
                    f"""
                    CASE
                        WHEN "{field}" IS NOT NULL
                        THEN concat('{field}=', "{field}", ';')
                        ELSE ''
                    END
                    """
                )

            # Query param: INFO expression and exported VCF columns
            query_update_info_value = (
                f""" concat('', {", ".join(query_update_info)}) """
            )
            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """

        else:

            # Query param: keep annotations as plain columns, no INFO
            query_update_info_value = f""" NULL """
            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """

        # Update query INFO column on the export table
        query_update = f"""
            UPDATE {transcripts_table_export}
            SET INFO = {query_update_info_value}

        """
        self.execute_query(query=query_update)

        # Export to the configured output file
        self.export_output(
            output_file=transcripts_export_output,
            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
        )

        # Drop the temporary export table
        query_drop_transcripts_table_export = f"""
            DROP TABLE {transcripts_table_export}
        """
        self.execute_query(query=query_drop_transcripts_table_export)

    def transcripts_prioritization(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
        and updates the variants table with the prioritized information.

        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
        This parameter is used to identify the table where the transcripts data is stored for the
        prioritization process
        :type transcripts_table: str
        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
        that contains various configuration settings for the prioritization process of transcripts. It
        is used to customize the behavior of the prioritization algorithm and includes settings such as
        the prefix for prioritization fields, default profiles, and other
        :type param: dict
        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
        transcripts prioritization process is successfully completed, and `False` if there are any
        issues or if no profile is defined for transcripts prioritization.
        """

        log.debug("Start transcripts prioritization...")

        # Param
        # NOTE(review): mutable default `param: dict = {}`; this method later
        # mutates pz_param (a sub-dict of param) in place — confirm intended.
        if not param:
            param = self.get_param()

        # Variants table
        table_variants = self.get_table_variants()

        # Transcripts table: create it when not provided
        # NOTE(review): typo "availalble" in the error message (runtime string)
        if transcripts_table is None:
            transcripts_table = self.create_transcript_view(
                transcripts_table="transcripts", param=param
            )
            if transcripts_table is None:
                msg_err = "No Transcripts table availalble"
                log.error(msg_err)
                raise ValueError(msg_err)
        log.debug(f"transcripts_table={transcripts_table}")

        # Get transcripts columns
        columns_as_list_query = f"""
            DESCRIBE {transcripts_table}
        """
        columns_as_list = list(
            self.get_query_to_df(columns_as_list_query)["column_name"]
        )

        # Create INFO column if it does not exist (prioritization writes to it)
        if "INFO" not in columns_as_list:
            query_add_info = f"""
                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
            """
            self.execute_query(query_add_info)

        # Prioritization param and Force only PZ Score and Flag
        pz_param = param.get("transcripts", {}).get("prioritization", {})

        # PZ profile by default
        pz_profile_default = (
            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
        )

        # Exit if no profile
        if pz_profile_default is None:
            log.warning("No profile defined for transcripts prioritization")
            return False

        # PZ fields mapping (source field -> prefixed field)
        pz_param_pzfields = {}

        # PZ field holding the selected transcript (e.g. "PTZTranscript")
        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"

        # Add PZ Transcript field to the header INFO fields
        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
            pz_fields_transcripts,
            ".",
            "String",
            f"Transcript selected from prioritization process, profile {pz_profile_default}",
            "unknown",
            "unknown",
            code_type_map["String"],
        )

        # Mandatory prioritization fields (always prefixed)
        pz_mandatory_fields_list = [
            "Score",
            "Flag",
            "Tags",
            "Comment",
            "Infos",
            "Class",
        ]
        pz_mandatory_fields = []
        for pz_mandatory_field in pz_mandatory_fields_list:
            pz_mandatory_fields.append(
                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
            )

        # PZ fields in param: mandatory fields map prefixed->prefixed, other
        # fields map original->prefixed and are declared in the header
        for pz_field in pz_param.get("pzfields", []):
            if pz_field in pz_mandatory_fields_list:
                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
                    pz_param.get("pzprefix", "PTZ") + pz_field
                )
            else:
                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
                pz_param_pzfields[pz_field] = pz_field_new

                # Add the prefixed annotation field to the header INFO fields
                self.get_header().infos[pz_field_new] = vcf.parser._Info(
                    pz_field_new,
                    ".",
                    "String",
                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
                    "unknown",
                    "unknown",
                    code_type_map["String"],
                )

        # Restrict prioritization to the mandatory fields
        # NOTE(review): mutates the shared param dict in place.
        pz_param["pzfields"] = pz_mandatory_fields

        # Run prioritization on the transcripts table
        prioritization_result = self.prioritization(
            table=transcripts_table,
            pz_param=param.get("transcripts", {}).get("prioritization", {}),
        )
        if not prioritization_result:
            log.warning("Transcripts prioritization not processed")
            return False

        # PZ fields sql query
query_update_select_list = [] 10285 query_update_concat_list = [] 10286 query_update_order_list = [] 10287 for pz_param_pzfield in set( 10288 list(pz_param_pzfields.keys()) + pz_mandatory_fields 10289 ): 10290 query_update_select_list.append(f" {pz_param_pzfield}, ") 10291 10292 for pz_param_pzfield in pz_param_pzfields: 10293 query_update_concat_list.append( 10294 f""" 10295 , CASE 10296 WHEN {pz_param_pzfield} IS NOT NULL 10297 THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield}) 10298 ELSE '' 10299 END 10300 """ 10301 ) 10302 10303 # Order by 10304 pz_orders = ( 10305 param.get("transcripts", {}) 10306 .get("prioritization", {}) 10307 .get("prioritization_transcripts_order", {}) 10308 ) 10309 if not pz_orders: 10310 pz_orders = { 10311 pz_param.get("pzprefix", "PTZ") + "Flag": "ASC", 10312 pz_param.get("pzprefix", "PTZ") + "Score": "DESC", 10313 } 10314 for pz_order in pz_orders: 10315 query_update_order_list.append( 10316 f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """ 10317 ) 10318 10319 # Fields to explode 10320 fields_to_explode = ( 10321 list(pz_param_pzfields.keys()) 10322 + pz_mandatory_fields 10323 + list(pz_orders.keys()) 10324 ) 10325 # Remove transcript column as a specific transcript column 10326 if "transcript" in fields_to_explode: 10327 fields_to_explode.remove("transcript") 10328 10329 # Fields intranscripts table 10330 query_transcripts_table = f""" 10331 DESCRIBE SELECT * FROM {transcripts_table} 10332 """ 10333 query_transcripts_table = self.get_query_to_df(query=query_transcripts_table) 10334 10335 # Check fields to explode 10336 for field_to_explode in fields_to_explode: 10337 if field_to_explode not in self.get_header_infos_list() + list( 10338 query_transcripts_table.column_name 10339 ): 10340 msg_err = f"INFO/{field_to_explode} NOT IN header" 10341 log.error(msg_err) 10342 raise ValueError(msg_err) 10343 10344 # Explode fields to explode 10345 self.explode_infos( 10346 table=transcripts_table, 10347 
fields=fields_to_explode, 10348 ) 10349 10350 # Transcript preference file 10351 transcripts_preference_file = ( 10352 param.get("transcripts", {}) 10353 .get("prioritization", {}) 10354 .get("prioritization_transcripts", {}) 10355 ) 10356 transcripts_preference_file = full_path(transcripts_preference_file) 10357 10358 # Transcript preference forced 10359 transcript_preference_force = ( 10360 param.get("transcripts", {}) 10361 .get("prioritization", {}) 10362 .get("prioritization_transcripts_force", False) 10363 ) 10364 # Transcript version forced 10365 transcript_version_force = ( 10366 param.get("transcripts", {}) 10367 .get("prioritization", {}) 10368 .get("prioritization_transcripts_version_force", False) 10369 ) 10370 10371 # Transcripts Ranking 10372 if transcripts_preference_file: 10373 10374 # Transcripts file to dataframe 10375 if os.path.exists(transcripts_preference_file): 10376 transcripts_preference_dataframe = transcripts_file_to_df( 10377 transcripts_preference_file 10378 ) 10379 else: 10380 log.error( 10381 f"Transcript file '{transcripts_preference_file}' does NOT exist" 10382 ) 10383 raise ValueError( 10384 f"Transcript file '{transcripts_preference_file}' does NOT exist" 10385 ) 10386 10387 # Order by depending to transcript preference forcing 10388 if transcript_preference_force: 10389 order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """ 10390 else: 10391 order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """ 10392 10393 # Transcript columns joined depend on version consideration 10394 if transcript_version_force: 10395 transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """ 10396 else: 10397 transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """ 10398 10399 # Query 
ranking for update 10400 query_update_ranking = f""" 10401 SELECT 10402 "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)} 10403 ROW_NUMBER() OVER ( 10404 PARTITION BY "#CHROM", POS, REF, ALT 10405 ORDER BY {order_by} 10406 ) AS rn 10407 FROM {transcripts_table} 10408 LEFT JOIN 10409 ( 10410 SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order 10411 FROM transcripts_preference_dataframe 10412 ) AS transcripts_preference 10413 ON {transcripts_version_join} 10414 """ 10415 10416 else: 10417 10418 # Query ranking for update 10419 query_update_ranking = f""" 10420 SELECT 10421 "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)} 10422 ROW_NUMBER() OVER ( 10423 PARTITION BY "#CHROM", POS, REF, ALT 10424 ORDER BY {" , ".join(query_update_order_list)} 10425 ) AS rn 10426 FROM {transcripts_table} 10427 """ 10428 10429 # Export Transcripts prioritization infos to variants table 10430 query_update = f""" 10431 WITH RankedTranscripts AS ( 10432 {query_update_ranking} 10433 ) 10434 UPDATE {table_variants} 10435 SET 10436 INFO = CONCAT(CASE 10437 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 10438 THEN '' 10439 ELSE concat("INFO", ';') 10440 END, 10441 concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)}) 10442 ) 10443 FROM 10444 RankedTranscripts 10445 WHERE 10446 rn = 1 10447 AND variants."#CHROM" = RankedTranscripts."#CHROM" 10448 AND variants."POS" = RankedTranscripts."POS" 10449 AND variants."REF" = RankedTranscripts."REF" 10450 AND variants."ALT" = RankedTranscripts."ALT" 10451 """ 10452 10453 # log.debug(f"query_update={query_update}") 10454 self.execute_query(query=query_update) 10455 10456 # Return 10457 return True 10458 10459 def create_transcript_view_from_columns_map( 10460 self, 10461 transcripts_table: str = "transcripts", 10462 columns_maps: dict = {}, 10463 added_columns: list = [], 10464 temporary_tables: list = None, 10465 
annotation_fields: list = None, 10466 column_rename: dict = {}, 10467 column_clean: bool = False, 10468 column_case: str = None, 10469 ) -> tuple[list, list, list]: 10470 """ 10471 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 10472 specified columns mapping for transcripts data. 10473 10474 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10475 of the table where the transcripts data is stored or will be stored in the database. This table 10476 typically contains information about transcripts such as Ensembl transcript IDs, gene names, 10477 scores, predictions, etc. It defaults to "transcripts, defaults to transcripts 10478 :type transcripts_table: str (optional) 10479 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information 10480 about how to map columns from a transcripts table to create a view. Each entry in the 10481 `columns_maps` list represents a mapping configuration for a specific set of columns. It 10482 typically includes details such as the main transcript column and additional information columns 10483 :type columns_maps: dict 10484 :param added_columns: The `added_columns` parameter in the 10485 `create_transcript_view_from_columns_map` function is a list that stores the additional columns 10486 that will be added to the view being created based on the columns map provided. These columns 10487 are generated by exploding the transcript information columns along with the main transcript 10488 column 10489 :type added_columns: list 10490 :param temporary_tables: The `temporary_tables` parameter in the 10491 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 10492 tables created during the process of creating a transcript view from a columns map. 
These 10493 temporary tables are used to store intermediate results or transformations before the final view 10494 is generated 10495 :type temporary_tables: list 10496 :param annotation_fields: The `annotation_fields` parameter in the 10497 `create_transcript_view_from_columns_map` function is a list that stores the fields that are 10498 used for annotation in the query view creation process. These fields are extracted from the 10499 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 10500 :type annotation_fields: list 10501 :param column_rename: The `column_rename` parameter in the 10502 `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify 10503 custom renaming for columns during the creation of the temporary table view. This parameter 10504 provides a mapping of original column names to the desired renamed column names. By using this 10505 parameter, 10506 :type column_rename: dict 10507 :param column_clean: The `column_clean` parameter in the 10508 `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the 10509 column values should be cleaned or not. If set to `True`, the column values will be cleaned by 10510 removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to 10511 False 10512 :type column_clean: bool (optional) 10513 :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map` 10514 function is used to specify the case transformation to be applied to the columns during the view 10515 creation process. It allows you to control whether the column values should be converted to 10516 lowercase, uppercase, or remain unchanged 10517 :type column_case: str 10518 :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three 10519 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 
10520 """ 10521 10522 log.debug("Start transcrpts view creation from columns map...") 10523 10524 # "from_columns_map": [ 10525 # { 10526 # "transcripts_column": "Ensembl_transcriptid", 10527 # "transcripts_infos_columns": [ 10528 # "genename", 10529 # "Ensembl_geneid", 10530 # "LIST_S2_score", 10531 # "LIST_S2_pred", 10532 # ], 10533 # }, 10534 # { 10535 # "transcripts_column": "Ensembl_transcriptid", 10536 # "transcripts_infos_columns": [ 10537 # "genename", 10538 # "VARITY_R_score", 10539 # "Aloft_pred", 10540 # ], 10541 # }, 10542 # ], 10543 10544 # Init 10545 if temporary_tables is None: 10546 temporary_tables = [] 10547 if annotation_fields is None: 10548 annotation_fields = [] 10549 10550 # Variants table 10551 table_variants = self.get_table_variants() 10552 10553 for columns_map in columns_maps: 10554 10555 # Transcript column 10556 transcripts_column = columns_map.get("transcripts_column", None) 10557 10558 # Transcripts infos columns 10559 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 10560 10561 # Transcripts infos columns rename 10562 column_rename = columns_map.get("column_rename", column_rename) 10563 10564 # Transcripts infos columns clean 10565 column_clean = columns_map.get("column_clean", column_clean) 10566 10567 # Transcripts infos columns case 10568 column_case = columns_map.get("column_case", column_case) 10569 10570 if transcripts_column is not None: 10571 10572 # Explode 10573 added_columns += self.explode_infos( 10574 fields=[transcripts_column] + transcripts_infos_columns 10575 ) 10576 10577 # View clauses 10578 clause_select_variants = [] 10579 clause_select_tanscripts = [] 10580 for field in [transcripts_column] + transcripts_infos_columns: 10581 10582 # AS field 10583 as_field = field 10584 10585 # Rename 10586 if column_rename: 10587 as_field = column_rename.get(as_field, as_field) 10588 10589 # Clean 10590 if column_clean: 10591 as_field = clean_annotation_field(as_field) 10592 10593 # Case 10594 if 
column_case: 10595 if column_case.lower() in ["lower"]: 10596 as_field = as_field.lower() 10597 elif column_case.lower() in ["upper"]: 10598 as_field = as_field.upper() 10599 10600 # Clause select Variants 10601 clause_select_variants.append( 10602 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10603 ) 10604 10605 if field in [transcripts_column]: 10606 clause_select_tanscripts.append( 10607 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10608 ) 10609 else: 10610 clause_select_tanscripts.append( 10611 f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """ 10612 ) 10613 annotation_fields.append(as_field) 10614 10615 # Querey View 10616 query = f""" 10617 SELECT 10618 "#CHROM", POS, REF, ALT, INFO, 10619 "{transcripts_column}" AS 'transcript', 10620 {", ".join(clause_select_tanscripts)} 10621 FROM ( 10622 SELECT 10623 "#CHROM", POS, REF, ALT, INFO, 10624 {", ".join(clause_select_variants)} 10625 FROM {table_variants} 10626 ) 10627 WHERE "{transcripts_column}" IS NOT NULL 10628 """ 10629 10630 # Create temporary table 10631 temporary_table = transcripts_table + "".join( 10632 random.choices(string.ascii_uppercase + string.digits, k=10) 10633 ) 10634 10635 # Temporary_tables 10636 temporary_tables.append(temporary_table) 10637 query_view = f""" 10638 CREATE TEMPORARY TABLE {temporary_table} 10639 AS ({query}) 10640 """ 10641 self.execute_query(query=query_view) 10642 10643 return added_columns, temporary_tables, annotation_fields 10644 10645 def create_transcript_view_from_column_format( 10646 self, 10647 transcripts_table: str = "transcripts", 10648 column_formats: dict = {}, 10649 temporary_tables: list = None, 10650 annotation_fields: list = None, 10651 column_rename: dict = {}, 10652 column_clean: bool = False, 10653 column_case: str = None, 10654 ) -> tuple[list, list, list]: 10655 """ 10656 The `create_transcript_view_from_column_format` function generates a transcript view based on 10657 specified column formats, adds additional 
columns and annotation fields, and returns the list of 10658 temporary tables and annotation fields. 10659 10660 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10661 of the table containing the transcripts data. This table will be used as the base table for 10662 creating the transcript view. The default value for this parameter is "transcripts", but you can 10663 provide a different table name if needed, defaults to transcripts 10664 :type transcripts_table: str (optional) 10665 :param column_formats: The `column_formats` parameter is a dictionary that contains information 10666 about the columns to be used for creating the transcript view. Each entry in the dictionary 10667 specifies the mapping between a transcripts column and a transcripts infos column. This 10668 parameter allows you to define how the columns from the transcripts table should be transformed 10669 or mapped 10670 :type column_formats: dict 10671 :param temporary_tables: The `temporary_tables` parameter in the 10672 `create_transcript_view_from_column_format` function is a list that stores the names of 10673 temporary views created during the process of creating a transcript view from a column format. 10674 These temporary views are used to manipulate and extract data before generating the final 10675 transcript view 10676 :type temporary_tables: list 10677 :param annotation_fields: The `annotation_fields` parameter in the 10678 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 10679 that are extracted from the temporary views created during the process. 
These annotation fields 10680 are obtained by querying the temporary views and extracting the column names excluding specific 10681 columns like `#CH 10682 :type annotation_fields: list 10683 :param column_rename: The `column_rename` parameter in the 10684 `create_transcript_view_from_column_format` function is a dictionary that allows you to specify 10685 custom renaming of columns in the transcripts infos table. By providing a mapping of original 10686 column names to new column names in this dictionary, you can rename specific columns during the 10687 process 10688 :type column_rename: dict 10689 :param column_clean: The `column_clean` parameter in the 10690 `create_transcript_view_from_column_format` function is a boolean flag that determines whether 10691 the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns 10692 will be cleaned during the creation of the transcript view based on the specified column format, 10693 defaults to False 10694 :type column_clean: bool (optional) 10695 :param column_case: The `column_case` parameter in the 10696 `create_transcript_view_from_column_format` function is used to specify the case transformation 10697 to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" 10698 to convert the column names to uppercase or lowercase, respectively 10699 :type column_case: str 10700 :return: The `create_transcript_view_from_column_format` function returns two lists: 10701 `temporary_tables` and `annotation_fields`. 
10702 """ 10703 10704 log.debug("Start transcrpts view creation from column format...") 10705 10706 # "from_column_format": [ 10707 # { 10708 # "transcripts_column": "ANN", 10709 # "transcripts_infos_column": "Feature_ID", 10710 # } 10711 # ], 10712 10713 # Init 10714 if temporary_tables is None: 10715 temporary_tables = [] 10716 if annotation_fields is None: 10717 annotation_fields = [] 10718 10719 for column_format in column_formats: 10720 10721 # annotation field and transcript annotation field 10722 annotation_field = column_format.get("transcripts_column", "ANN") 10723 transcript_annotation = column_format.get( 10724 "transcripts_infos_column", "Feature_ID" 10725 ) 10726 10727 # Transcripts infos columns rename 10728 column_rename = column_format.get("column_rename", column_rename) 10729 10730 # Transcripts infos columns clean 10731 column_clean = column_format.get("column_clean", column_clean) 10732 10733 # Transcripts infos columns case 10734 column_case = column_format.get("column_case", column_case) 10735 10736 # Temporary View name 10737 temporary_view_name = transcripts_table + "".join( 10738 random.choices(string.ascii_uppercase + string.digits, k=10) 10739 ) 10740 10741 # Create temporary view name 10742 temporary_view_name = self.annotation_format_to_table( 10743 uniquify=True, 10744 annotation_field=annotation_field, 10745 view_name=temporary_view_name, 10746 annotation_id=transcript_annotation, 10747 column_rename=column_rename, 10748 column_clean=column_clean, 10749 column_case=column_case, 10750 ) 10751 10752 # Annotation fields 10753 if temporary_view_name: 10754 query_annotation_fields = f""" 10755 SELECT * 10756 FROM ( 10757 DESCRIBE SELECT * 10758 FROM {temporary_view_name} 10759 ) 10760 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 10761 """ 10762 df_annotation_fields = self.get_query_to_df( 10763 query=query_annotation_fields 10764 ) 10765 10766 # Add temporary view and annotation fields 10767 
    def create_transcript_view(
        self,
        transcripts_table: str = None,
        transcripts_table_drop: bool = True,
        param: dict = {},
    ) -> str:
        """
        Create the transcripts table by merging per-source temporary transcript
        tables built from the "struct" configuration.

        Temporary tables are produced by `create_transcript_view_from_columns_map`
        and `create_transcript_view_from_column_format`, merged with
        UNION BY NAME, then grouped per variant+transcript with annotation
        values aggregated as comma-separated distinct lists. An optional
        transcript-id mapping file translates transcript ids (and can filter to
        mapped transcripts only); transcript versions can optionally be stripped.

        :param transcripts_table: name of the table to create; if None, taken
            from param "transcripts.table" (default "transcripts")
        :type transcripts_table: str (optional)
        :param transcripts_table_drop: drop an existing transcripts table before
            creating it, defaults to True
        :type transcripts_table_drop: bool (optional)
        :param param: configuration dictionary (struct, version/mapping
            options); defaults to `self.get_param()`
        :type param: dict
        :return: the name of the created transcripts table, or None when no
            "struct" configuration is available
        """

        log.debug("Start transcripts view creation...")

        # Default table name
        transcripts_table_default = "transcripts"

        # Param (fall back to object-level parameters)
        if not param:
            param = self.get_param()

        # Struct: configuration describing how to extract transcripts
        struct = param.get("transcripts", {}).get("struct", None)

        # Transcript version: strip ".N" suffixes from transcript ids
        transcript_id_remove_version = param.get("transcripts", {}).get(
            "transcript_id_remove_version", False
        )

        # Transcripts mapping file (two columns: transcript, alias)
        transcript_id_mapping_file = param.get("transcripts", {}).get(
            "transcript_id_mapping_file", None
        )

        # Transcripts mapping force: keep only transcripts present in the mapping
        transcript_id_mapping_force = param.get("transcripts", {}).get(
            "transcript_id_mapping_force", None
        )

        if struct:

            # Transcripts table name from param when not given
            if transcripts_table is None:
                transcripts_table = param.get("transcripts", {}).get(
                    "table", transcripts_table_default
                )

            # Columns added to the variants table (dropped again at the end)
            added_columns = []

            # Temporary tables created by the two builders
            temporary_tables = []

            # Annotation fields collected by the two builders
            annotation_fields = []

            # From columns map
            columns_maps = struct.get("from_columns_map", [])
            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_columns_map(
                    transcripts_table=transcripts_table,
                    columns_maps=columns_maps,
                    added_columns=added_columns,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            # NOTE(review): the builder mutates and returns the same lists, so
            # these += appear to duplicate entries; duplicates are neutralized
            # by the set() dedup below — confirm before changing
            added_columns += added_columns_tmp
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # From column format
            column_formats = struct.get("from_column_format", [])
            temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_column_format(
                    transcripts_table=transcripts_table,
                    column_formats=column_formats,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # Remove some specific fields/columns from annotation fields
            annotation_fields = list(set(annotation_fields))
            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
                if field in annotation_fields:
                    annotation_fields.remove(field)

            # Merge temporary tables query
            query_merge = ""
            for temporary_table in list(set(temporary_tables)):

                # First temporary table
                if not query_merge:
                    query_merge = f"""
                        SELECT * FROM {temporary_table}
                    """
                # Other temporary tables (using UNION BY NAME)
                else:
                    query_merge += f"""
                        UNION BY NAME SELECT * FROM {temporary_table}
                    """

            # Aliases for nested subqueries
            transcript_table_tmp = "transcripts_tmp"
            transcript_table_tmp2 = "transcripts_tmp2"
            transcript_table_tmp3 = "transcripts_tmp3"

            # Merge on transcript: aggregation clauses
            query_merge_on_transcripts_annotation_fields = []

            # Add transcript list (distinct transcript ids per group)
            query_merge_on_transcripts_annotation_fields.append(
                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
            )

            # Aggregate all annotation fields as comma-separated distinct values
            for annotation_field in set(annotation_fields):
                query_merge_on_transcripts_annotation_fields.append(
                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
                )

            # Transcripts mapping
            if transcript_id_mapping_file:

                # Transcript mapping dataframe — referenced by variable name in
                # the SQL below (presumably via DuckDB's DataFrame replacement
                # scan — TODO confirm), so do not remove this "unused" local
                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
                transcript_id_mapping_dataframe = transcripts_file_to_df(
                    transcript_id_mapping_file, column_names=["transcript", "alias"]
                )

                # Transcript version removal changes how columns are selected,
                # grouped and joined (always join on version-less alias)
                if transcript_id_remove_version:
                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """
                else:
                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """

                # Transcript column for group by merge: prefer the mapped id,
                # fall back to the original id
                query_transcript_merge_group_by = """
                    CASE
                        WHEN transcript_mapped NOT IN ('')
                        THEN split_part(transcript_mapped, '.', 1)
                        ELSE split_part(transcript_original, '.', 1)
                    END
                """

                # Merge query (first aggregation pass with mapping join)
                transcripts_tmp2_query = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    {query_left_join}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
                """

                # Retrieve columns after merge
                transcripts_tmp2_describe_query = f"""
                    DESCRIBE {transcripts_tmp2_query}
                """
                transcripts_tmp2_describe_list = list(
                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
                        "column_name"
                    ]
                )

                # Create list of columns for select clause (second pass)
                transcripts_tmp2_describe_select_clause = []
                for field in transcripts_tmp2_describe_list:
                    if field not in [
                        "#CHROM",
                        "POS",
                        "REF",
                        "ALT",
                        "INFO",
                        "transcript_mapped",
                    ]:
                        as_field = field
                        if field in ["transcript_original"]:
                            as_field = "transcripts_mapped"
                        transcripts_tmp2_describe_select_clause.append(
                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
                        )

                # Merge with mapping (second aggregation pass, grouped on the
                # mapped-or-original transcript id)
                query_merge_on_transcripts = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        CASE
                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
                            THEN ANY_VALUE(transcript_mapped)
                            ELSE ANY_VALUE(transcript_original)
                        END AS transcript,
                        {", ".join(transcripts_tmp2_describe_select_clause)}
                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
                        {query_transcript_merge_group_by}
                """

                # Add transcript filter from mapping file
                if transcript_id_mapping_force:
                    query_merge_on_transcripts = f"""
                        SELECT *
                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
                    """

            # No transcript mapping
            else:

                # Remove transcript version
                if transcript_id_remove_version:
                    query_transcript_column = f"""
                        split_part({transcript_table_tmp}.transcript, '.', 1)
                    """
                else:
                    query_transcript_column = """
                        transcript
                    """

                # Query sections
                query_transcript_column_select = (
                    f"{query_transcript_column} AS transcript"
                )
                query_transcript_column_group_by = query_transcript_column

                # Query for transcripts view (single aggregation pass)
                query_merge_on_transcripts = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
                """

            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")

            # Drop transcripts table if necessary
            if transcripts_table_drop:
                query_drop = f"""
                    DROP TABLE IF EXISTS {transcripts_table};
                """
                self.execute_query(query=query_drop)

            # Merge and create transcripts table
            query_create_view = f"""
                CREATE TABLE IF NOT EXISTS {transcripts_table}
                AS {query_merge_on_transcripts}
            """
            self.execute_query(query=query_create_view)

            # Remove columns added to the variants table by explode_infos
            for added_column in added_columns:
                self.drop_column(column=added_column)

        else:

            # No struct configuration: nothing to create
            transcripts_table = None

        return transcripts_table
                )
                # NOTE(review): query_transcript_column_select and
                # query_transcript_column_group_by are assigned here but the
                # query below interpolates query_transcript_column directly —
                # these two locals appear unused in this branch; confirm.
                query_transcript_column_group_by = query_transcript_column

                # Query for transcripts view (no mapping: transcript taken
                # as-is, transcript_mapped forced to NULL)
                query_merge_on_transcripts = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
                """

            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")

            # Drop transcripts table first if requested, so CREATE TABLE IF
            # NOT EXISTS below rebuilds it from scratch
            if transcripts_table_drop:
                query_drop = f"""
                    DROP TABLE IF EXISTS {transcripts_table};
                """
                self.execute_query(query=query_drop)

            # Materialize the merged transcripts query as the transcripts table
            query_create_view = f"""
                CREATE TABLE IF NOT EXISTS {transcripts_table}
                AS {query_merge_on_transcripts}
            """
            self.execute_query(query=query_create_view)

            # Remove the columns that were added temporarily on the variants
            # table (exploded INFO fields, variant id) earlier in this method
            for added_column in added_columns:
                self.drop_column(column=added_column)

        else:

            transcripts_table = None

        return transcripts_table

    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
        column_rename: dict = {},  # NOTE(review): mutable default argument; safe only if never mutated — confirm
        column_clean: bool = False,
        column_case: str = None,
    ) -> str:
        """
        Explode a snpEff-style annotation INFO field (e.g. ``ANN``) into a
        temporary table with one column per annotation sub-field.

        The sub-field names are parsed from the quoted, pipe-separated list in
        the INFO field's header description. Each variant's annotation string
        is converted to JSON (via ``explode_annotation_format``), the JSON keys
        are optionally renamed/cleaned/case-folded, each column's SQL type is
        detected from its values, and a temporary table ``view_name`` is
        created with one row per transcript and a ``transcript`` column taken
        from ``annotation_id``.

        :param uniquify: forwarded to ``explode_annotation_format`` to keep
            values unique, defaults to True
        :param annotation_field: INFO field holding the annotations,
            defaults to "ANN"
        :param annotation_id: sub-field used as the transcript identifier
            column, defaults to "Feature_ID"
        :param view_name: name of the temporary table to create,
            defaults to "transcripts"
        :param column_rename: optional mapping of original sub-field names to
            new column names
        :param column_clean: if True, column names are passed through
            ``clean_annotation_field``
        :param column_case: "lower" or "upper" to force column-name case;
            None leaves names unchanged
        :return: the name of the created table (``view_name``), or None when
            ``annotation_field`` is absent from the VCF header
        """

        # Name of the intermediate JSON column added on the pandas DataFrame
        annotation_format = "annotation_explode"

        # Apply the same rename/clean transformations to the transcript id
        # column name that will be applied to the data columns below
        if column_rename:
            annotation_id = column_rename.get(annotation_id, annotation_id)

        if column_clean:
            annotation_id = clean_annotation_field(annotation_id)

        # Prefix for exploded INFO columns.
        # NOTE(review): any truthy configured prefix is replaced by the
        # hard-coded "INFO/" — looks intentional (normalization) but verify;
        # a falsy prefix is used as-is in the concatenations below.
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names for the source annotation field and the exploded JSON
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header object (pyvcf Reader)
        vcf_reader = self.get_header()

        # Columns added to the variants table during this method; dropped at the end
        added_columns = []

        # Explode the annotation INFO field into its own column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the annotation sub-field names from the header
            # description: the quoted, " | "-separated list (snpEff convention)
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Alphanumeric-only key mapped to the original field name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                # NOTE(review): ann_header is populated but not referenced
                # again in this method; only ann_header_desc is used below
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Add a variant-id column (dropped at the end with added_columns)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Pull variants plus the raw annotation column into pandas
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Convert each annotation string to a JSON document keyed by the
            # header sub-field names
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Discover the JSON keys actually present (DuckDB queries the
            # pandas DataFrame by name via replacement scans)
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed SELECT expression per key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Original JSON key; key_clean is the output column name
                key = row.iloc[0]
                key_clean = key

                # Optional rename
                if column_rename:
                    key_clean = column_rename.get(key_clean, key_clean)

                # Optional cleaning
                if column_clean:
                    key_clean = clean_annotation_field(key_clean)

                # Optional case folding
                if column_case:
                    if column_case.lower() in ["lower"]:
                        key_clean = key_clean.lower()
                    elif column_case.lower() in ["upper"]:
                        key_clean = key_clean.upper()

                # Extract this key's values to sniff the column type
                query_json_type = f"""SELECT
                    unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Normalize missing values (None/"" -> NaN) and drop them so
                # type detection only sees real values
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect the SQL column type from the remaining values
                column_type = detect_column_type(df_json_type[key_clean])

                # Typed extraction expression; empty strings become NULL
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create the temporary table: one row per transcript, with the
            # annotation_id column duplicated as 'transcript'
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                    )
                );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field absent from header: nothing created
            view_name = None

        # Drop the temporary columns added on the variants table
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name

    def transcript_view_to_variants(
        self,
        transcripts_table: str = None,
        transcripts_column_id: str = None,
        transcripts_info_json: str = None,
        transcripts_info_field_json: str = None,
        transcripts_info_format: str = None,
        transcripts_info_field_format: str = None,
        param: dict = {},  # NOTE(review): mutable default argument; only read here, but confirm
    ) -> bool:
        """
        Fold the transcripts table back onto the variants table as
        annotations, in JSON and/or pipe-separated structured form.

        Depending on which targets are configured, this method:
        - adds a JSON column ``transcripts_info_json`` on the variants table
          holding all transcripts of each variant keyed by transcript id;
        - appends the same JSON as INFO field ``transcripts_info_field_json``;
        - adds a VARCHAR column ``transcripts_info_format`` with
          'transcript|field|...' entries joined per variant;
        - appends that string as INFO field ``transcripts_info_field_format``.
        Corresponding INFO definitions are added to the in-memory VCF header.

        Each parameter left as None is resolved from
        ``param["transcripts"]`` (``param`` itself defaulting to
        ``self.get_param()``); table defaults to "transcripts" and the id
        column to "transcript".

        :return: True when an update was performed (or attempted), False when
            none of the four output targets is configured.
        """

        msg_info_prefix = "Start transcripts view to variants annotations"

        log.debug(f"{msg_info_prefix}...")

        # Defaults used when neither the argument nor param provides a value
        transcripts_table_default = "transcripts"
        transcripts_column_id_default = "transcript"
        transcripts_info_json_default = None
        transcripts_info_format_default = None
        transcripts_info_field_json_default = None
        transcripts_info_field_format_default = None

        # Fall back to the object's parameters
        if not param:
            param = self.get_param()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Transcripts column ID
        if transcripts_column_id is None:
            transcripts_column_id = param.get("transcripts", {}).get(
                "column_id", transcripts_column_id_default
            )

        # Transcripts info json
        if transcripts_info_json is None:
            transcripts_info_json = param.get("transcripts", {}).get(
                "transcripts_info_json", transcripts_info_json_default
            )

        # Transcripts info field JSON
        if transcripts_info_field_json is None:
            transcripts_info_field_json = param.get("transcripts", {}).get(
                "transcripts_info_field_json", transcripts_info_field_json_default
            )
        # NOTE(review): if transcripts_info_field_json is set while
        # transcripts_info_json stays None, the JSON UPDATE below interpolates
        # the literal text 'None' as a column alias; the disabled fallback
        # kept here looks related — confirm intended behavior.
        # if transcripts_info_field_json is not None and transcripts_info_json is None:
        #     transcripts_info_json = transcripts_info_field_json

        # Transcripts info format
        if transcripts_info_format is None:
            transcripts_info_format = param.get("transcripts", {}).get(
                "transcripts_info_format", transcripts_info_format_default
            )

        # Transcripts info field FORMAT
        if transcripts_info_field_format is None:
            transcripts_info_field_format = param.get("transcripts", {}).get(
                "transcripts_info_field_format", transcripts_info_field_format_default
            )
        # if (
        #     transcripts_info_field_format is not None
        #     and transcripts_info_format is None
        # ):
        #     transcripts_info_format = transcripts_info_field_format

        # Variants table
        table_variants = self.get_table_variants()

        # Nothing to do when no output target is configured
        if (
            transcripts_info_json is None
            and transcripts_info_field_json is None
            and transcripts_info_format is None
            and transcripts_info_field_format is None
        ):
            return False

        # Annotation columns of the transcripts table (everything except the
        # variant key columns and the transcript id column)
        query_transcripts_infos_columns = f"""
            SELECT *
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
            )
            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
        """
        transcripts_infos_columns = list(
            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
        )

        # SQL fragments reused by both the JSON and the FORMAT updates
        clause_select = []
        clause_to_json = []
        clause_to_format = []
        for field in transcripts_infos_columns:
            # Do not consider INFO field for export into fields
            if field not in ["INFO"]:
                # Split comma-joined values into one row per value
                clause_select.append(
                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
                )
                clause_to_json.append(f""" '{field}': "{field}" """)
                clause_to_format.append(f""" "{field}" """)

        # SET clauses accumulated for the two UPDATE statements
        update_set_json = []
        update_set_format = []

        # In-memory VCF header, extended with the new INFO definitions
        vcf_reader = self.get_header()

        # Transcripts to info column in JSON
        if transcripts_info_json:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_json,
                column_type="JSON",
                default_value=None,
                drop=False,
            )

            # Add header definition.
            # NOTE(review): "unknwon" (source/version fields) is a typo for
            # "unknown" in the original source, repeated below; left untouched
            # here since it is a runtime string.
            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
                transcripts_info_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_json.append(
                f""" {transcripts_info_json}=t.{transcripts_info_json} """
            )

        # Transcripts to info field in JSON
        if transcripts_info_field_json:

            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")

            # Append ';<field>=<json>' to INFO, normalizing empty/'.' INFO
            update_set_json.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_json}=',
                            t.{transcripts_info_json}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header definition ("unknwon" typo preserved, see above)
            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
                transcripts_info_field_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_json:

            # Aggregate all transcripts of each variant into one JSON object
            # keyed by transcript id, then apply the accumulated SET clauses
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_json)}
                FROM
                    (
                    SELECT
                        "#CHROM", POS, REF, ALT,
                        concat(
                            '{{',
                            string_agg(
                                '"' || "{transcripts_column_id}" || '":' ||
                                to_json(json_output)
                            ),
                            '}}'
                        )::JSON AS {transcripts_info_json}
                    FROM
                        (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            "{transcripts_column_id}",
                            to_json(
                                {{{",".join(clause_to_json)}}}
                            )::JSON AS json_output
                        FROM
                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                        WHERE "{transcripts_column_id}" IS NOT NULL
                        )
                    GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                  AND {table_variants}."POS" = t."POS"
                  AND {table_variants}."REF" = t."REF"
                  AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        # Transcripts to info column in FORMAT
        if transcripts_info_format:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_format,
                column_type="VARCHAR",
                default_value=None,
                drop=False,
            )

            # Add header definition ("unknwon" typo preserved, see above)
            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
                transcripts_info_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_format.append(
                f""" {transcripts_info_format}=t.{transcripts_info_format} """
            )

        else:

            # No output column requested: reuse this name as an internal
            # alias inside the FORMAT query below
            transcripts_info_format = "transcripts_info_format"

        # Transcripts to info field in structured FORMAT
        if transcripts_info_field_format:

            log.debug(f"{msg_info_prefix} - Annotation in structured format...")

            # Append ';<field>=<formatted>' to INFO, normalizing empty/'.' INFO
            update_set_format.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_format}=',
                            t.{transcripts_info_format}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header definition ("unknwon" typo preserved, see above)
            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
                transcripts_info_field_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_format:

            # Build 'transcript|field|...' per transcript, aggregate per
            # variant, then apply the accumulated SET clauses
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_format)}
                FROM
                    (
                    SELECT
                        "#CHROM", POS, REF, ALT,
                        string_agg({transcripts_info_format}) AS {transcripts_info_format}
                    FROM
                        (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            "{transcripts_column_id}",
                            concat(
                                "{transcripts_column_id}",
                                '|',
                                {", '|', ".join(clause_to_format)}
                            ) AS {transcripts_info_format}
                        FROM
                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                        )
                    GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                  AND {table_variants}."POS" = t."POS"
                  AND {table_variants}."REF" = t."REF"
                  AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        return True
class Variants:
    """
    In-memory/DB-backed representation of a variants (VCF-like) dataset.

    Wraps a DuckDB (or SQLite) connection holding a 'variants' table, plus the
    associated VCF header, input/output file descriptions, configuration and
    parameters.
    """

    def __init__(
        self,
        conn=None,
        input: str = None,
        output: str = None,
        config: dict = {},  # NOTE(review): mutable default argument; stored as-is
        param: dict = {},   # NOTE(review): mutable default argument; stored as-is
        load: bool = False,
    ) -> None:
        """
        Initialize the object: input/output files, config, params, database
        connection, VCF header and samples; optionally load the data.

        :param conn: existing database connection to reuse; a new one is
            created when None
        :param input: input file path (or object exposing ``.name``)
        :param output: output file path (or object exposing ``.name``)
        :param config: configuration dictionary
        :param param: parameters dictionary
        :param load: when True, load the input data immediately
        """

        # Init variables
        self.init_variables()

        # Input
        self.set_input(input)

        # Config
        self.set_config(config)

        # Param
        self.set_param(param)

        # Output
        self.set_output(output)

        # connexion
        self.set_connexion(conn)

        # Header
        self.set_header()

        # Samples
        self.set_samples()

        # Load data
        if load:
            self.load_data()

    def set_samples(self, samples: list = None) -> list:
        """
        Set the ``samples`` attribute from the given list, or fall back to
        ``param["samples"]["list"]`` when no list is provided.

        :param samples: list of sample names, or None to read from params
        :return: the samples list actually stored (possibly None)
        """

        if not samples:
            samples = self.get_param().get("samples", {}).get("list", None)

        self.samples = samples

        return samples

    def get_samples(self) -> list:
        """
        Return the ``samples`` attribute (list of sample names, or None).
        """

        return self.samples

    def get_samples_check(self) -> bool:
        """
        Return ``param["samples"]["check"]``; defaults to True when the key
        is absent.
        """

        return self.get_param().get("samples", {}).get("check", True)

    def set_input(self, input: str = None) -> None:
        """
        Set input-file attributes: ``input`` (path), and when a path is given,
        ``input_name``, ``input_extension`` and ``input_format`` (extension
        without the dot).

        Non-string inputs are accepted when they expose a ``.name`` attribute
        (e.g. file-like objects).

        :param input: input file path or file-like object
        """

        if input and not isinstance(input, str):
            # NOTE(review): bare except, and the error message is missing its
            # closing quote after the filename — both worth fixing upstream
            try:
                self.input = input.name
            except:
                log.error(f"Input file '{input} in bad format")
                raise ValueError(f"Input file '{input} in bad format")
        else:
            self.input = input

        # Input format attributes are only set when an input was provided;
        # otherwise they keep any previous value (none on a fresh object)
        if input:
            input_name, input_extension = os.path.splitext(self.input)
            self.input_name = input_name
            self.input_extension = input_extension
            self.input_format = self.input_extension.replace(".", "")

    def set_config(self, config: dict) -> None:
        """
        Store the given dictionary as this object's configuration.

        :param config: configuration dictionary
        """

        self.config = config

    def set_param(self, param: dict) -> None:
        """
        Store the given dictionary as this object's parameters.

        :param param: parameters dictionary
        """

        self.param = param

    def init_variables(self) -> None:
        """
        Initialize the attributes and lookup maps used by the rest of the
        class (table names, comparison-operator map, VCF type codes and their
        SQL equivalents).
        """

        # Prefix used for temporary artifacts (e.g. temp dirs/files)
        self.prefix = "howard"
        # Name of the main variants table
        self.table_variants = "variants"
        self.dataframe = None

        # Filter-operator keywords to SQL operators
        self.comparison_map = {
            "gt": ">",
            "gte": ">=",
            "lt": "<",
            "lte": "<=",
            "equals": "=",
            "contains": "SIMILAR TO",
        }

        # VCF header type name to internal numeric code
        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}

        # VCF header type name to SQL column type
        self.code_type_map_to_sql = {
            "Integer": "INTEGER",
            "String": "VARCHAR",
            "Float": "FLOAT",
            "Flag": "VARCHAR",
        }

        self.index_additionnal_fields = []

    def get_indexing(self) -> bool:
        """
        Return the "indexing" parameter; defaults to False when absent.
        """

        return self.get_param().get("indexing", False)

    def get_connexion_config(self) -> dict:
        """
        Build the connection configuration dictionary (threads, memory limit,
        temporary directory, access mode) from this object's config.

        :return: dict suitable for passing to the database connector
        """

        # config
        config = self.get_config()

        # Connexion config
        connexion_config = {}
        threads = self.get_threads()

        # Threads
        if threads:
            connexion_config["threads"] = threads

        # Memory
        # if config.get("memory", None):
        #     connexion_config["memory_limit"] = config.get("memory")
        if self.get_memory():
            connexion_config["memory_limit"] = self.get_memory()

        # Temporary directory
        if config.get("tmp", None):
            connexion_config["temp_directory"] = config.get("tmp")

        # Access mode: RO/RW shortcuts expanded; in-memory databases are
        # forced to READ_WRITE
        if config.get("access", None):
            access = config.get("access")
            if access in ["RO"]:
                access = "READ_ONLY"
            elif access in ["RW"]:
                access = "READ_WRITE"
            connexion_db = self.get_connexion_db()
            # NOTE(review): `in` here is a substring test on ":memory:",
            # not an equality check — probably intended as == ; confirm
            if connexion_db in ":memory:":
                access = "READ_WRITE"
            connexion_config["access_mode"] = access

        return connexion_config

    def get_duckdb_settings(self) -> dict:
        """
        Load DuckDB settings from config["duckdb_settings"], which may be a
        path to a YAML/JSON file or an inline JSON string.

        :return: settings dictionary (empty when not configured)
        """

        # config
        config = self.get_config()

        # duckdb settings
        duckdb_settings_dict = {}
        if config.get("duckdb_settings", None):
            duckdb_settings = config.get("duckdb_settings")
            duckdb_settings = full_path(duckdb_settings)
            # duckdb setting is a file (YAML parser also accepts JSON)
            if os.path.exists(duckdb_settings):
                with open(duckdb_settings) as json_file:
                    duckdb_settings_dict = yaml.safe_load(json_file)
            # duckdb settings is a string
            else:
                duckdb_settings_dict = json.loads(duckdb_settings)

        return duckdb_settings_dict

    def set_connexion_db(self) -> str:
        """
        Determine and store the database target for the connection: the input
        file itself for db/duckdb inputs, ":memory:", a temporary file, or an
        explicit connexion type value.

        :return: the chosen connection database string
        """

        # Default connexion db
        default_connexion_db = ":memory:"

        # Find connexion db
        if self.get_input_format() in ["db", "duckdb"]:
            connexion_db = self.get_input()
        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
            connexion_db = default_connexion_db
        elif self.get_connexion_type() in ["tmpfile"]:
            # A fresh temp directory hosting the database file
            tmp_name = tempfile.mkdtemp(
                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
            )
            connexion_db = f"{tmp_name}/tmp.db"
        elif self.get_connexion_type() != "":
            connexion_db = self.get_connexion_type()
        else:
            connexion_db = default_connexion_db

        # Set connexion db
        self.connexion_db = connexion_db

        return connexion_db

    def set_connexion(self, conn) -> None:
        """
        Create (or adopt) the database connection and store it on the object,
        applying any configured DuckDB PRAGMA settings.

        :param conn: existing connection to reuse; when None a new duckdb or
            sqlite connection is created according to config
            ("connexion_format", default "duckdb")
        """

        # Connexion db
        connexion_db = self.set_connexion_db()

        # Connexion config
        connexion_config = self.get_connexion_config()

        # Connexion format
        connexion_format = self.get_config().get("connexion_format", "duckdb")
        # Set connexion format
        self.connexion_format = connexion_format

        # Connexion
        if not conn:
            if connexion_format in ["duckdb"]:
                conn = duckdb.connect(connexion_db, config=connexion_config)
                # Apply DuckDB settings as PRAGMAs (strings get quoted)
                duckdb_settings = self.get_duckdb_settings()
                if duckdb_settings:
                    for setting in duckdb_settings:
                        setting_value = duckdb_settings.get(setting)
                        if isinstance(setting_value, str):
                            setting_value = f"'{setting_value}'"
                        conn.execute(f"PRAGMA {setting}={setting_value};")
            elif connexion_format in ["sqlite"]:
                conn = sqlite3.connect(connexion_db)

        # Set connexion
        self.conn = conn

        # Log
        log.debug(f"connexion_format: {connexion_format}")
        log.debug(f"connexion_db: {connexion_db}")
        log.debug(f"connexion config: {connexion_config}")
        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

    def set_output(self, output: str = None) -> None:
        """
        Set output-file attributes: ``output`` (path), ``output_name``,
        ``output_extension`` and ``output_format`` (extension without the
        dot); all three derived attributes are None when no output is given.

        Non-string outputs are read through their ``.name`` attribute.

        :param output: output file path or file-like object
        """

        if output and not isinstance(output, str):
            self.output = output.name
        else:
            self.output = output

        # Output format
        if self.output:
            output_name, output_extension = os.path.splitext(self.output)
            self.output_name = output_name
            self.output_extension = output_extension
            self.output_format = self.output_extension.replace(".", "")
        else:
            self.output_name = None
            self.output_extension = None
            self.output_format = None

    def set_header(self) -> None:
        """
        Read the VCF header for the input file and store it both as a list of
        lines (``header_list``) and as a pyvcf Reader object (``header_vcf``).

        Header sources, in order: an explicit config "header_file"; the input
        file itself for (compressed) VCF/hdr; a sidecar ``<input>.hdr`` file;
        otherwise the header is inferred from the file's columns via the
        Database helper, falling back to a minimal default VCF header.
        """

        input_file = self.get_input()
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itsself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with rel columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    # NOTE(review): bare except silently turns any failure
                    # into the default header (logged as a warning)
                    except:

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # try for unknown format ?

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            self.header_list = None
            self.header_vcf = None

    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
        """
        Execute a SQL query and return the result as a pandas DataFrame,
        dispatching on the connection format (duckdb or sqlite).

        :param query: SQL query to execute
        :param limit: when set, fetch at most this many rows (first batch /
            first chunk only); also raises pandas' display.max_rows to limit
        :return: result DataFrame
        """

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Limit in query
        if limit:
            pd.set_option("display.max_rows", limit)
            if connexion_format in ["duckdb"]:
                # Fetch a single record batch of size `limit`
                df = (
                    self.conn.execute(query)
                    .fetch_record_batch(limit)
                    .read_next_batch()
                    .to_pandas()
                )
            elif connexion_format in ["sqlite"]:
                # First chunk of a chunked read
                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))

        # Full query
        else:
            if connexion_format in ["duckdb"]:
                df = self.conn.execute(query).df()
            elif connexion_format in ["sqlite"]:
                df = pd.read_sql_query(query, self.conn)

        return df

    def get_overview(self) -> None:
        """
        Log an overview of the current object: input/output files with their
        formats, the configuration and parameters (and, presumably, the
        dataframe — the remainder of this method is outside the visible
        chunk).
        """
        table_variants_from = self.get_table_variants(clause="from")
        sql_columns = self.get_header_columns_as_sql()
        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
        # df is not used in the visible part of this method; presumably
        # logged further below — confirm against the full source
        df = self.get_query_to_df(sql_query_export)
        log.info(
            "Input: "
            + str(self.get_input())
            + " ["
            + str(str(self.get_input_format()))
            + "]"
        )
        log.info(
            "Output: "
            + str(self.get_output())
            + " ["
            + str(str(self.get_output_format()))
            + "]"
        )
        log.info("Config: ")
        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Param: ")
        for d
in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 556 "\n" 557 ): 558 log.info("\t" + str(d)) 559 log.info("Sample list: " + str(self.get_header_sample_list())) 560 log.info("Dataframe: ") 561 for d in str(df).split("\n"): 562 log.info("\t" + str(d)) 563 564 # garbage collector 565 del df 566 gc.collect() 567 568 return None 569 570 def get_stats(self) -> dict: 571 """ 572 The `get_stats` function calculates and returns various statistics of the current object, 573 including information about the input file, variants, samples, header fields, quality, and 574 SNVs/InDels. 575 :return: a dictionary containing various statistics of the current object. The dictionary has 576 the following structure: 577 """ 578 579 # Log 580 log.info(f"Stats Calculation...") 581 582 # table varaints 583 table_variants_from = self.get_table_variants() 584 585 # stats dict 586 stats = {"Infos": {}} 587 588 ### File 589 input_file = self.get_input() 590 stats["Infos"]["Input file"] = input_file 591 592 # Header 593 header_infos = self.get_header().infos 594 header_formats = self.get_header().formats 595 header_infos_list = list(header_infos) 596 header_formats_list = list(header_formats) 597 598 ### Variants 599 600 stats["Variants"] = {} 601 602 # Variants by chr 603 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 604 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 605 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 606 by=["CHROM"], kind="quicksort" 607 ) 608 609 # Total number of variants 610 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 611 612 # Calculate percentage 613 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 614 lambda x: (x / nb_of_variants) 615 ) 616 617 stats["Variants"]["Number of variants by chromosome"] = ( 618 nb_of_variants_by_chrom.to_dict(orient="index") 619 ) 620 621 
stats["Infos"]["Number of variants"] = int(nb_of_variants) 622 623 ### Samples 624 625 # Init 626 samples = {} 627 nb_of_samples = 0 628 629 # Check Samples 630 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 631 log.debug(f"Check samples...") 632 for sample in self.get_header_sample_list(): 633 sql_query_samples = f""" 634 SELECT '{sample}' as sample, 635 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 636 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 637 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 638 FROM {table_variants_from} 639 WHERE ( 640 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 641 AND 642 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 643 ) 644 GROUP BY genotype 645 """ 646 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 647 sample_genotype_count = sql_query_genotype_df["count"].sum() 648 if len(sql_query_genotype_df): 649 nb_of_samples += 1 650 samples[f"{sample} - {sample_genotype_count} variants"] = ( 651 sql_query_genotype_df.to_dict(orient="index") 652 ) 653 654 stats["Samples"] = samples 655 stats["Infos"]["Number of samples"] = nb_of_samples 656 657 # # 658 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 659 # stats["Infos"]["Number of samples"] = nb_of_samples 660 # elif nb_of_samples: 661 # stats["Infos"]["Number of samples"] = "not a VCF format" 662 663 ### INFO and FORMAT fields 664 header_types_df = {} 665 header_types_list = { 666 "List of INFO fields": header_infos, 667 "List of FORMAT fields": header_formats, 668 } 669 i = 0 670 for header_type in header_types_list: 671 672 header_type_infos = header_types_list.get(header_type) 673 header_infos_dict = {} 674 675 for info in header_type_infos: 676 677 i += 1 678 header_infos_dict[i] = {} 679 680 # ID 681 header_infos_dict[i]["id"] = info 682 683 # num 684 genotype_map = 
{None: ".", -1: "A", -2: "G", -3: "R"} 685 if header_type_infos[info].num in genotype_map.keys(): 686 header_infos_dict[i]["Number"] = genotype_map.get( 687 header_type_infos[info].num 688 ) 689 else: 690 header_infos_dict[i]["Number"] = header_type_infos[info].num 691 692 # type 693 if header_type_infos[info].type: 694 header_infos_dict[i]["Type"] = header_type_infos[info].type 695 else: 696 header_infos_dict[i]["Type"] = "." 697 698 # desc 699 if header_type_infos[info].desc != None: 700 header_infos_dict[i]["Description"] = header_type_infos[info].desc 701 else: 702 header_infos_dict[i]["Description"] = "" 703 704 if len(header_infos_dict): 705 header_types_df[header_type] = pd.DataFrame.from_dict( 706 header_infos_dict, orient="index" 707 ).to_dict(orient="index") 708 709 # Stats 710 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 711 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 712 stats["Header"] = header_types_df 713 714 ### QUAL 715 if "QUAL" in self.get_header_columns(): 716 sql_query_qual = f""" 717 SELECT 718 avg(CAST(QUAL AS INTEGER)) AS Average, 719 min(CAST(QUAL AS INTEGER)) AS Minimum, 720 max(CAST(QUAL AS INTEGER)) AS Maximum, 721 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 722 median(CAST(QUAL AS INTEGER)) AS Median, 723 variance(CAST(QUAL AS INTEGER)) AS Variance 724 FROM {table_variants_from} 725 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 726 """ 727 728 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 729 stats["Quality"] = {"Stats": qual} 730 731 ### SNV and InDel 732 733 sql_query_snv = f""" 734 735 SELECT Type, count FROM ( 736 737 SELECT 738 'Total' AS Type, 739 count(*) AS count 740 FROM {table_variants_from} 741 742 UNION 743 744 SELECT 745 'MNV' AS Type, 746 count(*) AS count 747 FROM {table_variants_from} 748 WHERE len(REF) > 1 AND len(ALT) > 1 749 AND len(REF) = len(ALT) 750 751 UNION 752 753 SELECT 754 'InDel' AS Type, 755 count(*) AS count 756 FROM 
{table_variants_from} 757 WHERE len(REF) > 1 OR len(ALT) > 1 758 AND len(REF) != len(ALT) 759 760 UNION 761 762 SELECT 763 'SNV' AS Type, 764 count(*) AS count 765 FROM {table_variants_from} 766 WHERE len(REF) = 1 AND len(ALT) = 1 767 768 ) 769 770 ORDER BY count DESC 771 772 """ 773 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 774 775 sql_query_snv_substitution = f""" 776 SELECT 777 concat(REF, '>', ALT) AS 'Substitution', 778 count(*) AS count 779 FROM {table_variants_from} 780 WHERE len(REF) = 1 AND len(ALT) = 1 781 GROUP BY REF, ALT 782 ORDER BY count(*) DESC 783 """ 784 snv_substitution = ( 785 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 786 ) 787 stats["Variants"]["Counts"] = snv_indel 788 stats["Variants"]["Substitutions"] = snv_substitution 789 790 return stats 791 792 def stats_to_file(self, file: str = None) -> str: 793 """ 794 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 795 into a JSON object, and writes the JSON object to the specified file. 796 797 :param file: The `file` parameter is a string that represents the file path where the JSON data 798 will be written 799 :type file: str 800 :return: the name of the file that was written to. 801 """ 802 803 # Get stats 804 stats = self.get_stats() 805 806 # Serializing json 807 json_object = json.dumps(stats, indent=4) 808 809 # Writing to sample.json 810 with open(file, "w") as outfile: 811 outfile.write(json_object) 812 813 return file 814 815 def print_stats(self, output_file: str = None, json_file: str = None) -> None: 816 """ 817 The `print_stats` function generates a markdown file and prints the statistics contained in a 818 JSON file in a formatted manner. 819 820 :param output_file: The `output_file` parameter is a string that specifies the path and filename 821 of the output file where the stats will be printed in Markdown format. 
If no `output_file` is 822 provided, a temporary directory will be created and the stats will be saved in a file named 823 "stats.md" within that 824 :type output_file: str 825 :param json_file: The `json_file` parameter is a string that represents the path to the JSON 826 file where the statistics will be saved. If no value is provided, a temporary directory will be 827 created and a default file name "stats.json" will be used 828 :type json_file: str 829 :return: The function `print_stats` does not return any value. It has a return type annotation 830 of `None`. 831 """ 832 833 # Full path 834 output_file = full_path(output_file) 835 json_file = full_path(json_file) 836 837 with tempfile.TemporaryDirectory() as tmpdir: 838 839 # Files 840 if not output_file: 841 output_file = os.path.join(tmpdir, "stats.md") 842 if not json_file: 843 json_file = os.path.join(tmpdir, "stats.json") 844 845 # Create folders 846 if not os.path.exists(os.path.dirname(output_file)): 847 Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True) 848 if not os.path.exists(os.path.dirname(json_file)): 849 Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True) 850 851 # Create stats JSON file 852 stats_file = self.stats_to_file(file=json_file) 853 854 # Print stats file 855 with open(stats_file) as f: 856 stats = yaml.safe_load(f) 857 858 # Output 859 output_title = [] 860 output_index = [] 861 output = [] 862 863 # Title 864 output_title.append("# HOWARD Stats") 865 866 # Index 867 output_index.append("## Index") 868 869 # Process sections 870 for section in stats: 871 infos = stats.get(section) 872 section_link = "#" + section.lower().replace(" ", "-") 873 output.append(f"## {section}") 874 output_index.append(f"- [{section}]({section_link})") 875 876 if len(infos): 877 for info in infos: 878 try: 879 df = pd.DataFrame.from_dict(infos.get(info), orient="index") 880 is_df = True 881 except: 882 try: 883 df = pd.DataFrame.from_dict( 884 
json.loads((infos.get(info))), orient="index" 885 ) 886 is_df = True 887 except: 888 is_df = False 889 if is_df: 890 output.append(f"### {info}") 891 info_link = "#" + info.lower().replace(" ", "-") 892 output_index.append(f" - [{info}]({info_link})") 893 output.append(f"{df.to_markdown(index=False)}") 894 else: 895 output.append(f"- {info}: {infos.get(info)}") 896 else: 897 output.append(f"NA") 898 899 # Write stats in markdown file 900 with open(output_file, "w") as fp: 901 for item in output_title: 902 fp.write("%s\n" % item) 903 for item in output_index: 904 fp.write("%s\n" % item) 905 for item in output: 906 fp.write("%s\n" % item) 907 908 # Output stats in markdown 909 print("") 910 print("\n\n".join(output_title)) 911 print("") 912 print("\n\n".join(output)) 913 print("") 914 915 return None 916 917 def get_input(self) -> str: 918 """ 919 It returns the value of the input variable. 920 :return: The input is being returned. 921 """ 922 return self.input 923 924 def get_input_format(self, input_file: str = None) -> str: 925 """ 926 This function returns the format of the input variable, either from the provided input file or 927 by prompting for input. 928 929 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 930 represents the file path of the input file. If no `input_file` is provided when calling the 931 method, it will default to `None` 932 :type input_file: str 933 :return: The format of the input variable is being returned. 934 """ 935 936 if not input_file: 937 input_file = self.get_input() 938 input_format = get_file_format(input_file) 939 return input_format 940 941 def get_input_compressed(self, input_file: str = None) -> str: 942 """ 943 The function `get_input_compressed` returns the format of the input variable after compressing 944 it. 945 946 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 947 that represents the file path of the input file. 
If no `input_file` is provided when calling the 948 method, it will default to `None` and the method will then call `self.get_input()` to 949 :type input_file: str 950 :return: The function `get_input_compressed` returns the compressed format of the input 951 variable. 952 """ 953 954 if not input_file: 955 input_file = self.get_input() 956 input_compressed = get_file_compressed(input_file) 957 return input_compressed 958 959 def get_output(self) -> str: 960 """ 961 It returns the output of the neuron. 962 :return: The output of the neural network. 963 """ 964 965 return self.output 966 967 def get_output_format(self, output_file: str = None) -> str: 968 """ 969 The function `get_output_format` returns the format of the input variable or the output file if 970 provided. 971 972 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 973 that represents the file path of the output file. If no `output_file` is provided when calling 974 the method, it will default to the output obtained from the `get_output` method of the class 975 instance. The 976 :type output_file: str 977 :return: The format of the input variable is being returned. 978 """ 979 980 if not output_file: 981 output_file = self.get_output() 982 output_format = get_file_format(output_file) 983 984 return output_format 985 986 def get_config(self) -> dict: 987 """ 988 It returns the config 989 :return: The config variable is being returned. 990 """ 991 return self.config 992 993 def get_param(self) -> dict: 994 """ 995 It returns the param 996 :return: The param variable is being returned. 997 """ 998 return self.param 999 1000 def get_connexion_db(self) -> str: 1001 """ 1002 It returns the connexion_db attribute of the object 1003 :return: The connexion_db is being returned. 1004 """ 1005 return self.connexion_db 1006 1007 def get_prefix(self) -> str: 1008 """ 1009 It returns the prefix of the object. 1010 :return: The prefix is being returned. 
1011 """ 1012 return self.prefix 1013 1014 def get_table_variants(self, clause: str = "select") -> str: 1015 """ 1016 This function returns the table_variants attribute of the object 1017 1018 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 1019 defaults to select (optional) 1020 :return: The table_variants attribute of the object. 1021 """ 1022 1023 # Access 1024 access = self.get_config().get("access", None) 1025 1026 # Clauses "select", "where", "update" 1027 if clause in ["select", "where", "update"]: 1028 table_variants = self.table_variants 1029 # Clause "from" 1030 elif clause in ["from"]: 1031 # For Read Only 1032 if self.get_input_format() in ["parquet"] and access in ["RO"]: 1033 input_file = self.get_input() 1034 table_variants = f"'{input_file}' as variants" 1035 # For Read Write 1036 else: 1037 table_variants = f"{self.table_variants} as variants" 1038 else: 1039 table_variants = self.table_variants 1040 return table_variants 1041 1042 def get_tmp_dir(self) -> str: 1043 """ 1044 The function `get_tmp_dir` returns the temporary directory path based on configuration 1045 parameters or a default path. 1046 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1047 configuration, parameters, and a default value of "/tmp". 1048 """ 1049 1050 return get_tmp( 1051 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1052 ) 1053 1054 def get_connexion_type(self) -> str: 1055 """ 1056 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1057 1058 :return: The connexion type is being returned. 1059 """ 1060 return self.get_config().get("connexion_type", "memory") 1061 1062 def get_connexion(self): 1063 """ 1064 It returns the connection object 1065 1066 :return: The connection object. 1067 """ 1068 return self.conn 1069 1070 def close_connexion(self) -> None: 1071 """ 1072 This function closes the connection to the database. 
1073 :return: The connection is being closed. 1074 """ 1075 return self.conn.close() 1076 1077 def get_header(self, type: str = "vcf"): 1078 """ 1079 This function returns the header of the VCF file as a list of strings 1080 1081 :param type: the type of header you want to get, defaults to vcf (optional) 1082 :return: The header of the vcf file. 1083 """ 1084 1085 if self.header_vcf: 1086 if type == "vcf": 1087 return self.header_vcf 1088 elif type == "list": 1089 return self.header_list 1090 else: 1091 if type == "vcf": 1092 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1093 return header 1094 elif type == "list": 1095 return vcf_required 1096 1097 def get_header_infos_list(self) -> list: 1098 """ 1099 This function retrieves a list of information fields from the header. 1100 :return: A list of information fields from the header. 1101 """ 1102 1103 # Init 1104 infos_list = [] 1105 1106 for field in self.get_header().infos: 1107 infos_list.append(field) 1108 1109 return infos_list 1110 1111 def get_header_length(self, file: str = None) -> int: 1112 """ 1113 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1114 line. 1115 1116 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1117 header file. If this argument is provided, the function will read the header from the specified 1118 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1119 :type file: str 1120 :return: the length of the header list, excluding the #CHROM line. 1121 """ 1122 1123 if file: 1124 return len(self.read_vcf_header_file(file=file)) - 1 1125 elif self.get_header(type="list"): 1126 return len(self.get_header(type="list")) - 1 1127 else: 1128 return 0 1129 1130 def get_header_columns(self) -> str: 1131 """ 1132 This function returns the header list of a VCF 1133 1134 :return: The length of the header list. 
1135 """ 1136 if self.get_header(): 1137 return self.get_header(type="list")[-1] 1138 else: 1139 return "" 1140 1141 def get_header_columns_as_list(self) -> list: 1142 """ 1143 This function returns the header list of a VCF 1144 1145 :return: The length of the header list. 1146 """ 1147 if self.get_header(): 1148 return self.get_header_columns().strip().split("\t") 1149 else: 1150 return [] 1151 1152 def get_header_columns_as_sql(self) -> str: 1153 """ 1154 This function retruns header length (without #CHROM line) 1155 1156 :return: The length of the header list. 1157 """ 1158 sql_column_list = [] 1159 for col in self.get_header_columns_as_list(): 1160 sql_column_list.append(f'"{col}"') 1161 return ",".join(sql_column_list) 1162 1163 def get_header_sample_list( 1164 self, check: bool = False, samples: list = None, samples_force: bool = False 1165 ) -> list: 1166 """ 1167 The function `get_header_sample_list` returns a list of samples from a VCF header, with optional 1168 checking and filtering based on input parameters. 1169 1170 :param check: The `check` parameter in the `get_header_sample_list` function is a boolean 1171 parameter that determines whether to check if the samples in the list are properly defined as 1172 genotype columns. If `check` is set to `True`, the function will verify if each sample in the 1173 list is defined as a, defaults to False 1174 :type check: bool (optional) 1175 :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that 1176 allows you to specify a subset of samples from the header. If you provide a list of sample 1177 names, the function will check if each sample is defined in the header. 
If a sample is not found 1178 in the 1179 :type samples: list 1180 :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is 1181 a boolean parameter that determines whether to force the function to return the sample list 1182 without checking if the samples are genotype columns. If `samples_force` is set to `True`, the 1183 function will return the sample list without performing, defaults to False 1184 :type samples_force: bool (optional) 1185 :return: The function `get_header_sample_list` returns a list of samples based on the input 1186 parameters and conditions specified in the function. 1187 """ 1188 1189 # Init 1190 samples_list = [] 1191 1192 if samples is None: 1193 samples_list = self.header_vcf.samples 1194 else: 1195 samples_checked = [] 1196 for sample in samples: 1197 if sample in self.header_vcf.samples: 1198 samples_checked.append(sample) 1199 else: 1200 log.warning(f"Sample '{sample}' not defined in header") 1201 samples_list = samples_checked 1202 1203 # Force sample list without checking if is_genotype_column 1204 if samples_force: 1205 log.warning(f"Samples {samples_list} not checked if genotypes") 1206 return samples_list 1207 1208 if check: 1209 samples_checked = [] 1210 for sample in samples_list: 1211 if self.is_genotype_column(column=sample): 1212 samples_checked.append(sample) 1213 else: 1214 log.warning( 1215 f"Sample '{sample}' not defined as a sample (genotype not well defined)" 1216 ) 1217 samples_list = samples_checked 1218 1219 # Return samples list 1220 return samples_list 1221 1222 def is_genotype_column(self, column: str = None) -> bool: 1223 """ 1224 This function checks if a given column is a genotype column in a database. 1225 1226 :param column: The `column` parameter in the `is_genotype_column` method is a string that 1227 represents the column name in a database table. This method checks if the specified column is a 1228 genotype column in the database. 
If a column name is provided, it calls the `is_genotype_column` 1229 method of 1230 :type column: str 1231 :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter 1232 is not None, it calls the `is_genotype_column` method of the `Database` class with the specified 1233 column name and returns the result. If the `column` parameter is None, it returns False. 1234 """ 1235 1236 if column is not None: 1237 return Database(database=self.get_input()).is_genotype_column(column=column) 1238 else: 1239 return False 1240 1241 def get_verbose(self) -> bool: 1242 """ 1243 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1244 exist 1245 1246 :return: The value of the key "verbose" in the config dictionary. 1247 """ 1248 return self.get_config().get("verbose", False) 1249 1250 def get_connexion_format(self) -> str: 1251 """ 1252 It returns the connexion format of the object. 1253 :return: The connexion_format is being returned. 1254 """ 1255 connexion_format = self.connexion_format 1256 if connexion_format not in ["duckdb", "sqlite"]: 1257 log.error(f"Unknown connexion format {connexion_format}") 1258 raise ValueError(f"Unknown connexion format {connexion_format}") 1259 else: 1260 return connexion_format 1261 1262 def insert_file_to_table( 1263 self, 1264 file, 1265 columns: str, 1266 header_len: int = 0, 1267 sep: str = "\t", 1268 chunksize: int = 1000000, 1269 ) -> None: 1270 """ 1271 The function reads a file in chunks and inserts each chunk into a table based on the specified 1272 database format. 1273 1274 :param file: The `file` parameter is the file that you want to load into a table. It should be 1275 the path to the file on your system 1276 :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that 1277 should contain the names of the columns in the table where the data will be inserted. 
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Load a delimited file into the 'variants' table in chunks.

        :param file: path or open file handle of the file to load
        :param columns: comma-separated, quoted column names used in the
            duckdb INSERT statement
        :param header_len: number of leading lines to skip (file header),
            defaults to 0
        :param sep: field delimiter of the file, defaults to "\t"
        :param chunksize: number of rows read per chunk, defaults to
            1000000; may be overridden by config ``load.chunk``.
            NOTE(review): if the resulting chunksize is falsy (0/None),
            the file is silently not loaded at all.
        """

        # chunksize from config takes precedence over the argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # duckdb resolves "chunk" in this SQL through its pandas
                    # replacement scan of the local variable `chunk` —
                    # do not rename that variable
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # sqlite path appends the DataFrame directly
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
If it is set to `None`, the default value of 20480 will be used, defaults to 1336 20480 1337 :type sample_size: int (optional) 1338 """ 1339 1340 log.info("Loading...") 1341 1342 # change input file 1343 if input_file: 1344 self.set_input(input_file) 1345 self.set_header() 1346 1347 # drop variants table 1348 if drop_variants_table: 1349 self.drop_variants_table() 1350 1351 # get table variants 1352 table_variants = self.get_table_variants() 1353 1354 # Access 1355 access = self.get_config().get("access", None) 1356 log.debug(f"access: {access}") 1357 1358 # Input format and compress 1359 input_format = self.get_input_format() 1360 input_compressed = self.get_input_compressed() 1361 log.debug(f"input_format: {input_format}") 1362 log.debug(f"input_compressed: {input_compressed}") 1363 1364 # input_compressed_format 1365 if input_compressed: 1366 input_compressed_format = "gzip" 1367 else: 1368 input_compressed_format = "none" 1369 log.debug(f"input_compressed_format: {input_compressed_format}") 1370 1371 # Connexion format 1372 connexion_format = self.get_connexion_format() 1373 1374 # Sample size 1375 if not sample_size: 1376 sample_size = -1 1377 log.debug(f"sample_size: {sample_size}") 1378 1379 # Load data 1380 log.debug(f"Load Data from {input_format}") 1381 1382 # DuckDB connexion 1383 if connexion_format in ["duckdb"]: 1384 1385 # Database already exists 1386 if self.input_format in ["db", "duckdb"]: 1387 1388 if connexion_format in ["duckdb"]: 1389 log.debug(f"Input file format '{self.input_format}' duckDB") 1390 else: 1391 log.error( 1392 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1393 ) 1394 raise ValueError( 1395 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1396 ) 1397 1398 # Load from existing database format 1399 else: 1400 1401 try: 1402 # Create Table or View 1403 database = Database(database=self.input) 1404 sql_from = 
database.get_sql_from(sample_size=sample_size) 1405 1406 if access in ["RO"]: 1407 sql_load = ( 1408 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1409 ) 1410 else: 1411 sql_load = ( 1412 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1413 ) 1414 self.conn.execute(sql_load) 1415 1416 except: 1417 # Format not available 1418 log.error(f"Input file format '{self.input_format}' not available") 1419 raise ValueError( 1420 f"Input file format '{self.input_format}' not available" 1421 ) 1422 1423 # SQLite connexion 1424 elif connexion_format in ["sqlite"] and input_format in [ 1425 "vcf", 1426 "tsv", 1427 "csv", 1428 "psv", 1429 ]: 1430 1431 # Main structure 1432 structure = { 1433 "#CHROM": "VARCHAR", 1434 "POS": "INTEGER", 1435 "ID": "VARCHAR", 1436 "REF": "VARCHAR", 1437 "ALT": "VARCHAR", 1438 "QUAL": "VARCHAR", 1439 "FILTER": "VARCHAR", 1440 "INFO": "VARCHAR", 1441 } 1442 1443 # Strcuture with samples 1444 structure_complete = structure 1445 if self.get_header_sample_list(): 1446 structure["FORMAT"] = "VARCHAR" 1447 for sample in self.get_header_sample_list(): 1448 structure_complete[sample] = "VARCHAR" 1449 1450 # Columns list for create and insert 1451 sql_create_table_columns = [] 1452 sql_create_table_columns_list = [] 1453 for column in structure_complete: 1454 column_type = structure_complete[column] 1455 sql_create_table_columns.append( 1456 f'"{column}" {column_type} default NULL' 1457 ) 1458 sql_create_table_columns_list.append(f'"{column}"') 1459 1460 # Create database 1461 log.debug(f"Create Table {table_variants}") 1462 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1463 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1464 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1465 self.conn.execute(sql_create_table) 1466 1467 # chunksize define length of file chunk load file 1468 chunksize = 100000 1469 1470 # delimiter 1471 delimiter 
= file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format
                # NOTE(review): rebinding `input_file` to a bgzf handle means the
                # bgzf handle is NOT managed by this `with` block — confirm it is
                # closed downstream.
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                # Only VCF input carries an embedded header to skip on insert
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into a table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create index after insertion
        self.create_indexes()

    def get_explode_infos(self) -> bool:
        """
        The function `get_explode_infos` returns the value of the "explode_infos" parameter,
        defaulting to False if it is not set.

        :return: Whether INFO-field explosion is enabled (the "explode.explode_infos"
        parameter), or False if the parameter is not present.
        """

        return self.get_param().get("explode", {}).get("explode_infos", False)

    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        The `get_explode_infos_fields` function returns a list of INFO fields to explode, based
        on the input parameter `explode_infos_fields`.

        :param explode_infos_fields: Fields to be exploded, either as a list or as a
        comma-separated string. Each entry is treated as a regex pattern matched against the
        header INFO IDs; the keyword "*" expands to all header fields. If not provided, the
        value is read from the "explode.explode_infos_fields" parameter, defaulting to "*"
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: A boolean flag that determines whether to remove
        fields that are not present in the header. If set to `True`, any field that is not in
        the header will be excluded from the returned list, defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: The list of resolved field names, de-duplicated, in input-pattern order
        (matches of each pattern are appended in sorted order).
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If no fields, defined as all fields in header using keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword
            # NOTE(review): `fields_without_all` is computed but never read below —
            # dead-code candidate; confirm before removing.
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # format keyword * in regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all fields with pattern (regex match against header INFO IDs)
                r = re.compile(field)
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # An exact header match takes priority over pattern expansion;
                # otherwise fields explicitly listed in the input are removed from
                # this pattern's expansion (they are handled by their own entry)
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header (avoid not well formatted header)
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header (if asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []

    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
        """
        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix`
        parameter, or the "explode.explode_infos_prefix" parameter (empty string by default) if
        `explode_infos_prefix` is not provided.

        :param explode_infos_prefix: A string prefix prepended to exploded INFO column names
        :type explode_infos_prefix: str
        :return: the value of the variable `explode_infos_prefix`.
        """

        if not explode_infos_prefix:
            explode_infos_prefix = (
                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
            )

        return explode_infos_prefix

    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        The `add_column` function adds a column to a SQLite or DuckDB table with a default
        value if it doesn't already exist.

        :param table_name: The name of the table to which you want to add a column
        :param column_name: The name of the column that you want to add to the table
        :param column_type: The SQL data type of the new column, such as "INTEGER", "TEXT",
        "REAL", etc
        :param default_value: Optional default value assigned to the new column for existing
        rows that do not have a value for that column
        :param drop: A boolean flag that determines whether to drop the column if it already
        exists in the table. If `drop` is `True`, an existing column is dropped and re-created;
        if `False` (default), an existing column is left untouched and None is returned
        :type drop: bool (optional)
        :return: A dict describing the added column ("table_name", "column_name",
        "column_type", "default_value"), or None if the column was not newly added (it already
        existed, or it was dropped and re-created).
        """

        # added
        added = False
        dropped = False

        # Check if the column already exists in the table (case-insensitive compare)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name.upper() in [c.upper() for c in columns]:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # NOTE(review): a dropped-and-recreated column reports `added = False`, so
        # the method returns None in that case (callers compensate with their own
        # `force` flag) — confirm intended.
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column

    def drop_column(
        self, column: dict = None, table_name: str = None, column_name: str = None
    ) -> bool:
        """
        The `drop_column` function drops a specified column from a given table in a database
        and returns True if the column was successfully dropped, and False if the column does
        not exist in the table.

        :param column: Either a dict with keys "table_name" and "column_name" describing the
        column to drop, or a plain column name (string) to drop from the variants table
        :type column: dict
        :param table_name: The name of the table from which you want to drop a column
        :type table_name: str
        :param column_name: The name of the column that you want to drop from the table
        :type column_name: str
        :return: a boolean value. It returns True if the column was successfully dropped from
        the table, and False if the column does not exist in the table.
        """

        # Find column infos
        if column:
            if isinstance(column, dict):
                table_name = column.get("table_name", None)
                column_name = column.get("column_name", None)
            elif isinstance(column, str):
                table_name = self.get_table_variants()
                column_name = column
            else:
                table_name = None
                column_name = None

        # NOTE(review): this guard only trips when BOTH are missing — confirm `or`
        # was not intended (a missing column_name alone falls through and returns
        # False at the existence check below).
        if not table_name and not column_name:
            return False

        # Removed
        removed = False

        # Check if the column already exists in the table (case-sensitive here,
        # unlike add_column's check)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name in columns:
            log.debug(f"The {column_name} column exists in the {table_name} table")
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
            return False

        # Drop column in table # ALTER TABLE integers DROP k
        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
        self.execute_query(add_column_query)
        removed = True
        log.debug(
            f"The {column_name} column was successfully dropped to the {table_name} table"
        )
1769 return removed 1770 1771 def explode_infos( 1772 self, 1773 prefix: str = None, 1774 create_index: bool = False, 1775 fields: list = None, 1776 force: bool = False, 1777 proccess_all_fields_together: bool = False, 1778 table: str = None, 1779 ) -> list: 1780 """ 1781 The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into 1782 individual columns, returning a list of added columns. 1783 1784 :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO 1785 fields. If the `prefix` is not provided or is set to `None`, the function will use the value of 1786 `self.get_explode_infos_prefix()` as the prefix 1787 :type prefix: str 1788 :param create_index: The `create_index` parameter is a boolean flag that specifies whether to 1789 create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to 1790 `False`, indexes will not be created. The default value is `False`, defaults to False 1791 :type create_index: bool (optional) 1792 :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields 1793 that you want to explode into individual columns. If this parameter is not provided, all INFO 1794 fields will be exploded. You can specify the INFO fields you want to explode by passing them as 1795 a list to the ` 1796 :type fields: list 1797 :param force: The `force` parameter in the `explode_infos` function is a boolean flag that 1798 determines whether to drop and recreate a column if it already exists in the table. If `force` 1799 is set to `True`, the column will be dropped and recreated. If `force` is set to `False, 1800 defaults to False 1801 :type force: bool (optional) 1802 :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean 1803 flag that determines whether to process all the INFO fields together or individually. If set to 1804 `True`, all the INFO fields will be processed together. 
If set to `False`, each INFO field will 1805 be processed individually. The default value is, defaults to False 1806 :type proccess_all_fields_together: bool (optional) 1807 :param table: The `table` parameter in the `explode_infos` function is used to specify the name 1808 of the table where the exploded INFO fields will be added as individual columns. If you provide 1809 a value for the `table` parameter, the function will use that table name. If the `table` 1810 parameter is 1811 :type table: str 1812 :return: The `explode_infos` function returns a list of added columns. 1813 """ 1814 1815 # drop indexes 1816 self.drop_indexes() 1817 1818 # connexion format 1819 connexion_format = self.get_connexion_format() 1820 1821 # Access 1822 access = self.get_config().get("access", None) 1823 1824 # Added columns 1825 added_columns = [] 1826 1827 if access not in ["RO"]: 1828 1829 # prefix 1830 if prefix in [None, True] or not isinstance(prefix, str): 1831 if self.get_explode_infos_prefix() not in [None, True]: 1832 prefix = self.get_explode_infos_prefix() 1833 else: 1834 prefix = "INFO/" 1835 1836 # table variants 1837 if table is not None: 1838 table_variants = table 1839 else: 1840 table_variants = self.get_table_variants(clause="select") 1841 1842 # extra infos 1843 try: 1844 extra_infos = self.get_extra_infos() 1845 except: 1846 extra_infos = [] 1847 1848 # Header infos 1849 header_infos = self.get_header().infos 1850 1851 log.debug( 1852 f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields" 1853 ) 1854 1855 sql_info_alter_table_array = [] 1856 1857 # Info fields to check 1858 fields_list = list(header_infos) 1859 if fields: 1860 fields_list += fields 1861 fields_list = set(fields_list) 1862 1863 # If no fields 1864 if not fields: 1865 fields = [] 1866 1867 # Translate fields if patterns 1868 fields = self.get_explode_infos_fields(explode_infos_fields=fields) 1869 1870 for info in fields: 1871 1872 info_id_sql = prefix + info 1873 1874 if ( 1875 info 
in fields_list 1876 or prefix + info in fields_list 1877 or info in extra_infos 1878 ): 1879 1880 log.debug(f"Explode INFO fields - ADD '{info}' annotations fields") 1881 1882 if info in header_infos: 1883 info_type = header_infos[info].type 1884 info_num = header_infos[info].num 1885 else: 1886 info_type = "String" 1887 info_num = 0 1888 1889 type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR") 1890 if info_num != 1: 1891 type_sql = "VARCHAR" 1892 1893 # Add field 1894 added_column = self.add_column( 1895 table_name=table_variants, 1896 column_name=info_id_sql, 1897 column_type=type_sql, 1898 default_value="null", 1899 drop=force, 1900 ) 1901 1902 if added_column: 1903 added_columns.append(added_column) 1904 1905 if added_column or force: 1906 1907 # add field to index 1908 self.index_additionnal_fields.append(info_id_sql) 1909 1910 # Update field array 1911 if connexion_format in ["duckdb"]: 1912 update_info_field = f""" 1913 "{info_id_sql}" = 1914 CASE 1915 WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL 1916 ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) 1917 END 1918 """ 1919 elif connexion_format in ["sqlite"]: 1920 update_info_field = f""" 1921 "{info_id_sql}" = 1922 CASE 1923 WHEN instr(INFO, '{info}=') = 0 THEN NULL 1924 WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1) 1925 ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1) 1926 END 1927 """ 1928 1929 sql_info_alter_table_array.append(update_info_field) 1930 1931 if sql_info_alter_table_array: 1932 1933 # By chromosomes 1934 try: 1935 chromosomes_list = list( 1936 self.get_query_to_df( 1937 f""" 
SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """ 1938 )["#CHROM"] 1939 ) 1940 except: 1941 chromosomes_list = [None] 1942 1943 for chrom in chromosomes_list: 1944 log.debug(f"Explode INFO fields - Chromosome {chrom}...") 1945 1946 # Where clause 1947 where_clause = "" 1948 if chrom and len(chromosomes_list) > 1: 1949 where_clause = f""" WHERE "#CHROM" = '{chrom}' """ 1950 1951 # Update table 1952 if proccess_all_fields_together: 1953 sql_info_alter_table_array_join = ", ".join( 1954 sql_info_alter_table_array 1955 ) 1956 if sql_info_alter_table_array_join: 1957 sql_info_alter_table = f""" 1958 UPDATE {table_variants} 1959 SET {sql_info_alter_table_array_join} 1960 {where_clause} 1961 """ 1962 log.debug( 1963 f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..." 1964 ) 1965 # log.debug(sql_info_alter_table) 1966 self.conn.execute(sql_info_alter_table) 1967 else: 1968 sql_info_alter_num = 0 1969 for sql_info_alter in sql_info_alter_table_array: 1970 sql_info_alter_num += 1 1971 sql_info_alter_table = f""" 1972 UPDATE {table_variants} 1973 SET {sql_info_alter} 1974 {where_clause} 1975 """ 1976 log.debug( 1977 f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..." 
1978 ) 1979 # log.debug(sql_info_alter_table) 1980 self.conn.execute(sql_info_alter_table) 1981 1982 # create indexes 1983 if create_index: 1984 self.create_indexes() 1985 1986 return added_columns 1987 1988 def create_indexes(self) -> None: 1989 """ 1990 Create indexes on the table after insertion 1991 """ 1992 1993 # Access 1994 access = self.get_config().get("access", None) 1995 1996 # get table variants 1997 table_variants = self.get_table_variants("FROM") 1998 1999 if self.get_indexing() and access not in ["RO"]: 2000 # Create index 2001 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 2002 self.conn.execute(sql_create_table_index) 2003 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 2004 self.conn.execute(sql_create_table_index) 2005 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 2006 self.conn.execute(sql_create_table_index) 2007 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 2008 self.conn.execute(sql_create_table_index) 2009 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 2010 self.conn.execute(sql_create_table_index) 2011 for field in self.index_additionnal_fields: 2012 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 2013 self.conn.execute(sql_create_table_index) 2014 2015 def drop_indexes(self) -> None: 2016 """ 2017 Create indexes on the table after insertion 2018 """ 2019 2020 # Access 2021 access = self.get_config().get("access", None) 2022 2023 # get table variants 2024 table_variants = self.get_table_variants("FROM") 2025 2026 # Get database format 2027 connexion_format = 
self.get_connexion_format() 2028 2029 if access not in ["RO"]: 2030 if connexion_format in ["duckdb"]: 2031 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 2032 elif connexion_format in ["sqlite"]: 2033 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 2034 2035 list_indexes = self.conn.execute(sql_list_indexes) 2036 index_names = [row[0] for row in list_indexes.fetchall()] 2037 for index in index_names: 2038 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 2039 self.conn.execute(sql_drop_table_index) 2040 2041 def read_vcf_header(self, f) -> list: 2042 """ 2043 It reads the header of a VCF file and returns a list of the header lines 2044 2045 :param f: the file object 2046 :return: The header lines of the VCF file. 2047 """ 2048 2049 header_list = [] 2050 for line in f: 2051 header_list.append(line) 2052 if line.startswith("#CHROM"): 2053 break 2054 return header_list 2055 2056 def read_vcf_header_file(self, file: str = None) -> list: 2057 """ 2058 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 2059 uncompressed files. 2060 2061 :param file: The `file` parameter is a string that represents the path to the VCF header file 2062 that you want to read. It is an optional parameter, so if you don't provide a value, it will 2063 default to `None` 2064 :type file: str 2065 :return: The function `read_vcf_header_file` returns a list. 2066 """ 2067 2068 if self.get_input_compressed(input_file=file): 2069 with bgzf.open(file, "rt") as f: 2070 return self.read_vcf_header(f=f) 2071 else: 2072 with open(file, "rt") as f: 2073 return self.read_vcf_header(f=f) 2074 2075 def execute_query(self, query: str): 2076 """ 2077 It takes a query as an argument, executes it, and returns the results 2078 2079 :param query: The query to be executed 2080 :return: The result of the query is being returned. 
2081 """ 2082 if query: 2083 return self.conn.execute(query) # .fetchall() 2084 else: 2085 return None 2086 2087 def export_output( 2088 self, 2089 output_file: str | None = None, 2090 output_header: str | None = None, 2091 export_header: bool = True, 2092 query: str | None = None, 2093 parquet_partitions: list | None = None, 2094 chunk_size: int | None = None, 2095 threads: int | None = None, 2096 sort: bool = False, 2097 index: bool = False, 2098 order_by: str | None = None, 2099 ) -> bool: 2100 """ 2101 The `export_output` function exports data from a VCF file to a specified output file in various 2102 formats, including VCF, CSV, TSV, PSV, and Parquet. 2103 2104 :param output_file: The `output_file` parameter is a string that specifies the name of the 2105 output file to be generated by the function. This is where the exported data will be saved 2106 :type output_file: str 2107 :param output_header: The `output_header` parameter is a string that specifies the name of the 2108 file where the header of the VCF file will be exported. If this parameter is not provided, the 2109 header will be exported to a file with the same name as the `output_file` parameter, but with 2110 the extension " 2111 :type output_header: str 2112 :param export_header: The `export_header` parameter is a boolean flag that determines whether 2113 the header of a VCF file should be exported to a separate file or not. If `export_header` is 2114 True, the header will be exported to a file. If `export_header` is False, the header will not 2115 be, defaults to True, if output format is not VCF 2116 :type export_header: bool (optional) 2117 :param query: The `query` parameter is an optional SQL query that can be used to filter and 2118 select specific data from the VCF file before exporting it. 
If provided, only the data that 2119 matches the query will be exported 2120 :type query: str 2121 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 2122 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 2123 organize data in a hierarchical directory structure based on the values of one or more columns. 2124 This can improve query performance when working with large datasets 2125 :type parquet_partitions: list 2126 :param chunk_size: The `chunk_size` parameter specifies the number of 2127 records in batch when exporting data in Parquet format. This parameter is used for 2128 partitioning the Parquet file into multiple files. 2129 :type chunk_size: int 2130 :param threads: The `threads` parameter is an optional parameter that specifies the number of 2131 threads to be used during the export process. It determines the level of parallelism and can 2132 improve the performance of the export operation. If not provided, the function will use the 2133 default number of threads 2134 :type threads: int 2135 :param sort: The `sort` parameter is a boolean flag that determines whether the output file 2136 should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the 2137 genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to 2138 False 2139 :type sort: bool (optional) 2140 :param index: The `index` parameter is a boolean flag that determines whether an index should be 2141 created on the output file. If `index` is True, an index will be created. If `index` is False, 2142 no index will be created. The default value is False, defaults to False 2143 :type index: bool (optional) 2144 :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for 2145 sorting the output file. This parameter is only applicable when exporting data in VCF format 2146 :type order_by: str 2147 :return: a boolean value. 
It checks if the output file exists and returns True if it does, or 2148 None if it doesn't. 2149 """ 2150 2151 # Log 2152 log.info("Exporting...") 2153 2154 # Full path 2155 output_file = full_path(output_file) 2156 output_header = full_path(output_header) 2157 2158 # Config 2159 config = self.get_config() 2160 2161 # Param 2162 param = self.get_param() 2163 2164 # Tmp files to remove 2165 tmp_to_remove = [] 2166 2167 # If no output, get it 2168 if not output_file: 2169 output_file = self.get_output() 2170 2171 # If not threads 2172 if not threads: 2173 threads = self.get_threads() 2174 2175 # Auto header name with extension 2176 if export_header or output_header: 2177 if not output_header: 2178 output_header = f"{output_file}.hdr" 2179 # Export header 2180 self.export_header(output_file=output_file) 2181 2182 # Switch off export header if VCF output 2183 output_file_type = get_file_format(output_file) 2184 if output_file_type in ["vcf"]: 2185 export_header = False 2186 tmp_to_remove.append(output_header) 2187 2188 # Chunk size 2189 if not chunk_size: 2190 chunk_size = config.get("chunk_size", None) 2191 2192 # Parquet partition 2193 if not parquet_partitions: 2194 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2195 if parquet_partitions and isinstance(parquet_partitions, str): 2196 parquet_partitions = parquet_partitions.split(",") 2197 2198 # Order by 2199 if not order_by: 2200 order_by = param.get("export", {}).get("order_by", "") 2201 2202 # Header in output 2203 header_in_output = param.get("export", {}).get("include_header", False) 2204 2205 # Database 2206 database_source = self.get_connexion() 2207 2208 # Connexion format 2209 connexion_format = self.get_connexion_format() 2210 2211 # Explode infos 2212 if self.get_explode_infos(): 2213 self.explode_infos( 2214 prefix=self.get_explode_infos_prefix(), 2215 fields=self.get_explode_infos_fields(), 2216 force=False, 2217 ) 2218 2219 # if connexion_format in ["sqlite"] or query: 
2220 if connexion_format in ["sqlite"]: 2221 2222 # Export in Parquet 2223 random_tmp = "".join( 2224 random.choice(string.ascii_lowercase) for i in range(10) 2225 ) 2226 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2227 tmp_to_remove.append(database_source) 2228 2229 # Table Variants 2230 table_variants = self.get_table_variants() 2231 2232 # Create export query 2233 sql_query_export_subquery = f""" 2234 SELECT * FROM {table_variants} 2235 """ 2236 2237 # Write source file 2238 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2239 2240 # Create database 2241 database = Database( 2242 database=database_source, 2243 table="variants", 2244 header_file=output_header, 2245 conn_config=self.get_connexion_config(), 2246 ) 2247 2248 # Existing colomns header 2249 existing_columns_header = database.get_header_columns_from_database(query=query) 2250 2251 # Sample list 2252 if output_file_type in ["vcf"]: 2253 get_samples = self.get_samples() 2254 get_samples_check = self.get_samples_check() 2255 samples_force = get_samples is not None 2256 sample_list = self.get_header_sample_list( 2257 check=get_samples_check, 2258 samples=get_samples, 2259 samples_force=samples_force, 2260 ) 2261 else: 2262 sample_list = None 2263 2264 # Export file 2265 database.export( 2266 output_database=output_file, 2267 output_header=output_header, 2268 existing_columns_header=existing_columns_header, 2269 parquet_partitions=parquet_partitions, 2270 chunk_size=chunk_size, 2271 threads=threads, 2272 sort=sort, 2273 index=index, 2274 header_in_output=header_in_output, 2275 order_by=order_by, 2276 query=query, 2277 export_header=export_header, 2278 sample_list=sample_list, 2279 ) 2280 2281 # Remove 2282 remove_if_exists(tmp_to_remove) 2283 2284 return (os.path.exists(output_file) or None) and ( 2285 os.path.exists(output_file) or None 2286 ) 2287 2288 def get_extra_infos(self, table: str = None) -> list: 2289 """ 2290 The `get_extra_infos` 
function returns a list of columns that are in a specified table but not 2291 in the header. 2292 2293 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2294 name of the table from which you want to retrieve the extra columns that are not present in the 2295 header. If the `table` parameter is not provided when calling the function, it will default to 2296 using the variants 2297 :type table: str 2298 :return: A list of columns that are in the specified table but not in the header of the table. 2299 """ 2300 2301 header_columns = [] 2302 2303 if not table: 2304 table = self.get_table_variants(clause="from") 2305 header_columns = self.get_header_columns() 2306 2307 # Check all columns in the database 2308 query = f""" SELECT * FROM {table} LIMIT 1 """ 2309 log.debug(f"query {query}") 2310 table_columns = self.get_query_to_df(query).columns.tolist() 2311 extra_columns = [] 2312 2313 # Construct extra infos (not in header) 2314 for column in table_columns: 2315 if column not in header_columns: 2316 extra_columns.append(column) 2317 2318 return extra_columns 2319 2320 def get_extra_infos_sql(self, table: str = None) -> str: 2321 """ 2322 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2323 by double quotes 2324 2325 :param table: The name of the table to get the extra infos from. If None, the default table is 2326 used 2327 :type table: str 2328 :return: A string of the extra infos 2329 """ 2330 2331 return ", ".join( 2332 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2333 ) 2334 2335 def export_header( 2336 self, 2337 header_name: str = None, 2338 output_file: str = None, 2339 output_file_ext: str = ".hdr", 2340 clean_header: bool = True, 2341 remove_chrom_line: bool = False, 2342 ) -> str: 2343 """ 2344 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2345 specified options, and writes it to a new file. 
2346 2347 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2348 this parameter is not specified, the header will be written to the output file 2349 :type header_name: str 2350 :param output_file: The `output_file` parameter in the `export_header` function is used to 2351 specify the name of the output file where the header will be written. If this parameter is not 2352 provided, the header will be written to a temporary file 2353 :type output_file: str 2354 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2355 string that represents the extension of the output header file. By default, it is set to ".hdr" 2356 if not specified by the user. This extension will be appended to the `output_file` name to 2357 create the final, defaults to .hdr 2358 :type output_file_ext: str (optional) 2359 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2360 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2361 `True`, the function will clean the header by modifying certain lines based on a specific 2362 pattern. If `clean_header`, defaults to True 2363 :type clean_header: bool (optional) 2364 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2365 boolean flag that determines whether the #CHROM line should be removed from the header before 2366 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2367 defaults to False 2368 :type remove_chrom_line: bool (optional) 2369 :return: The function `export_header` returns the name of the temporary header file that is 2370 created. 
2371 """ 2372 2373 if not header_name and not output_file: 2374 output_file = self.get_output() 2375 2376 if self.get_header(): 2377 2378 # Get header object 2379 header_obj = self.get_header() 2380 2381 # Create database 2382 db_for_header = Database(database=self.get_input()) 2383 2384 # Get real columns in the file 2385 db_header_columns = db_for_header.get_columns() 2386 2387 with tempfile.TemporaryDirectory() as tmpdir: 2388 2389 # Write header file 2390 header_file_tmp = os.path.join(tmpdir, "header") 2391 f = open(header_file_tmp, "w") 2392 vcf.Writer(f, header_obj) 2393 f.close() 2394 2395 # Replace #CHROM line with rel columns 2396 header_list = db_for_header.read_header_file( 2397 header_file=header_file_tmp 2398 ) 2399 header_list[-1] = "\t".join(db_header_columns) 2400 2401 # Remove CHROM line 2402 if remove_chrom_line: 2403 header_list.pop() 2404 2405 # Clean header 2406 if clean_header: 2407 header_list_clean = [] 2408 for head in header_list: 2409 # Clean head for malformed header 2410 head_clean = head 2411 head_clean = re.subn( 2412 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2413 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2414 head_clean, 2415 2, 2416 )[0] 2417 # Write header 2418 header_list_clean.append(head_clean) 2419 header_list = header_list_clean 2420 2421 tmp_header_name = output_file + output_file_ext 2422 2423 f = open(tmp_header_name, "w") 2424 for line in header_list: 2425 f.write(line) 2426 f.close() 2427 2428 return tmp_header_name 2429 2430 def export_variant_vcf( 2431 self, 2432 vcf_file, 2433 remove_info: bool = False, 2434 add_samples: bool = True, 2435 list_samples: list = [], 2436 where_clause: str = "", 2437 index: bool = False, 2438 threads: int | None = None, 2439 ) -> bool | None: 2440 """ 2441 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2442 remove INFO field, add samples, and control compression and indexing. 
2443 2444 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2445 written to. It is the output file that will contain the filtered VCF data based on the specified 2446 parameters 2447 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2448 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2449 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2450 in, defaults to False 2451 :type remove_info: bool (optional) 2452 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2453 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2454 If set to False, the samples will be removed. The default value is True, defaults to True 2455 :type add_samples: bool (optional) 2456 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2457 in the output VCF file. By default, all samples will be included. If you provide a list of 2458 samples, only those samples will be included in the output file 2459 :type list_samples: list 2460 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2461 determines whether or not to create an index for the output VCF file. If `index` is set to 2462 `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False 2463 :type index: bool (optional) 2464 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2465 number of threads to use for exporting the VCF file. It determines how many parallel threads 2466 will be used during the export process. More threads can potentially speed up the export process 2467 by utilizing multiple cores of the processor. 
If 2468 :type threads: int | None 2469 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2470 method with various parameters including the output file, query, threads, sort flag, and index 2471 flag. The `export_output` method is responsible for exporting the VCF data based on the 2472 specified parameters and configurations provided in the `export_variant_vcf` function. 2473 """ 2474 2475 # Config 2476 config = self.get_config() 2477 2478 # Extract VCF 2479 log.debug("Export VCF...") 2480 2481 # Table variants 2482 table_variants = self.get_table_variants() 2483 2484 # Threads 2485 if not threads: 2486 threads = self.get_threads() 2487 2488 # Info fields 2489 if remove_info: 2490 if not isinstance(remove_info, str): 2491 remove_info = "." 2492 info_field = f"""'{remove_info}' as INFO""" 2493 else: 2494 info_field = "INFO" 2495 2496 # Samples fields 2497 if add_samples: 2498 if not list_samples: 2499 list_samples = self.get_header_sample_list() 2500 if list_samples: 2501 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2502 else: 2503 samples_fields = "" 2504 log.debug(f"samples_fields: {samples_fields}") 2505 else: 2506 samples_fields = "" 2507 2508 # Where clause 2509 if where_clause is None: 2510 where_clause = "" 2511 2512 # Variants 2513 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2514 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2515 log.debug(f"sql_query_select={sql_query_select}") 2516 2517 return self.export_output( 2518 output_file=vcf_file, 2519 output_header=None, 2520 export_header=True, 2521 query=sql_query_select, 2522 parquet_partitions=None, 2523 chunk_size=config.get("chunk_size", None), 2524 threads=threads, 2525 sort=True, 2526 index=index, 2527 order_by=None, 2528 ) 2529 2530 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2531 """ 2532 It takes a list of commands and runs 
them in parallel using the number of threads specified 2533 2534 :param commands: A list of commands to run 2535 :param threads: The number of threads to use, defaults to 1 (optional) 2536 """ 2537 2538 run_parallel_commands(commands, threads) 2539 2540 def get_threads(self, default: int = 1) -> int: 2541 """ 2542 This function returns the number of threads to use for a job, with a default value of 1 if not 2543 specified. 2544 2545 :param default: The `default` parameter in the `get_threads` method is used to specify the 2546 default number of threads to use if no specific value is provided. If no value is provided for 2547 the `threads` parameter in the configuration or input parameters, the `default` value will be 2548 used, defaults to 1 2549 :type default: int (optional) 2550 :return: the number of threads to use for the current job. 2551 """ 2552 2553 # Config 2554 config = self.get_config() 2555 2556 # Param 2557 param = self.get_param() 2558 2559 # Input threads 2560 input_thread = param.get("threads", config.get("threads", None)) 2561 2562 # Check threads 2563 if not input_thread: 2564 threads = default 2565 elif int(input_thread) <= 0: 2566 threads = os.cpu_count() 2567 else: 2568 threads = int(input_thread) 2569 return threads 2570 2571 def get_memory(self, default: str = None) -> str: 2572 """ 2573 This function retrieves the memory value from parameters or configuration with a default value 2574 if not found. 2575 2576 :param default: The `get_memory` function takes in a default value as a string parameter. This 2577 default value is used as a fallback in case the `memory` parameter is not provided in the 2578 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2579 the function 2580 :type default: str 2581 :return: The `get_memory` function returns a string value representing the memory parameter. If 2582 the `input_memory` is provided in the parameters, it will return that value. 
Otherwise, it will 2583 return the default value provided as an argument to the function. 2584 """ 2585 2586 # Config 2587 config = self.get_config() 2588 2589 # Param 2590 param = self.get_param() 2591 2592 # Input threads 2593 input_memory = param.get("memory", config.get("memory", None)) 2594 2595 # Check threads 2596 if input_memory: 2597 memory = input_memory 2598 else: 2599 memory = default 2600 2601 return memory 2602 2603 def update_from_vcf(self, vcf_file: str) -> None: 2604 """ 2605 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2606 2607 :param vcf_file: the path to the VCF file 2608 """ 2609 2610 connexion_format = self.get_connexion_format() 2611 2612 if connexion_format in ["duckdb"]: 2613 self.update_from_vcf_duckdb(vcf_file) 2614 elif connexion_format in ["sqlite"]: 2615 self.update_from_vcf_sqlite(vcf_file) 2616 2617 def update_from_vcf_duckdb(self, vcf_file: str) -> None: 2618 """ 2619 It takes a VCF file and updates the INFO column of the variants table in the database with the 2620 INFO column of the VCF file 2621 2622 :param vcf_file: the path to the VCF file 2623 """ 2624 2625 # varaints table 2626 table_variants = self.get_table_variants() 2627 2628 # Loading VCF into temporaire table 2629 skip = self.get_header_length(file=vcf_file) 2630 vcf_df = pd.read_csv( 2631 vcf_file, 2632 sep="\t", 2633 engine="c", 2634 skiprows=skip, 2635 header=0, 2636 low_memory=False, 2637 ) 2638 sql_query_update = f""" 2639 UPDATE {table_variants} as table_variants 2640 SET INFO = concat( 2641 CASE 2642 WHEN INFO NOT IN ('', '.') 2643 THEN INFO 2644 ELSE '' 2645 END, 2646 ( 2647 SELECT 2648 concat( 2649 CASE 2650 WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.') 2651 THEN ';' 2652 ELSE '' 2653 END 2654 , 2655 CASE 2656 WHEN table_parquet.INFO NOT IN ('','.') 2657 THEN table_parquet.INFO 2658 ELSE '' 2659 END 2660 ) 2661 FROM vcf_df as table_parquet 2662 WHERE CAST(table_parquet.\"#CHROM\" AS 
VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR) 2663 AND table_parquet.\"POS\" = table_variants.\"POS\" 2664 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 2665 AND table_parquet.\"REF\" = table_variants.\"REF\" 2666 AND table_parquet.INFO NOT IN ('','.') 2667 ) 2668 ) 2669 ; 2670 """ 2671 self.conn.execute(sql_query_update) 2672 2673 def update_from_vcf_sqlite(self, vcf_file: str) -> None: 2674 """ 2675 It creates a temporary table in the SQLite database, loads the VCF file into the temporary 2676 table, then updates the INFO column of the variants table with the INFO column of the temporary 2677 table 2678 2679 :param vcf_file: The path to the VCF file you want to update the database with 2680 """ 2681 2682 # Create a temporary table for the VCF 2683 table_vcf = "tmp_vcf" 2684 sql_create = ( 2685 f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0" 2686 ) 2687 self.conn.execute(sql_create) 2688 2689 # Loading VCF into temporaire table 2690 vcf_df = pd.read_csv( 2691 vcf_file, sep="\t", comment="#", header=None, low_memory=False 2692 ) 2693 vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] 2694 vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False) 2695 2696 # Update table 'variants' with VCF data 2697 # warning: CONCAT as || operator 2698 sql_query_update = f""" 2699 UPDATE variants as table_variants 2700 SET INFO = CASE 2701 WHEN INFO NOT IN ('', '.') 2702 THEN INFO 2703 ELSE '' 2704 END || 2705 ( 2706 SELECT 2707 CASE 2708 WHEN table_variants.INFO NOT IN ('','.') 2709 AND table_vcf.INFO NOT IN ('','.') 2710 THEN ';' 2711 ELSE '' 2712 END || 2713 CASE 2714 WHEN table_vcf.INFO NOT IN ('','.') 2715 THEN table_vcf.INFO 2716 ELSE '' 2717 END 2718 FROM {table_vcf} as table_vcf 2719 WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\" 2720 AND table_vcf.\"POS\" = table_variants.\"POS\" 2721 AND table_vcf.\"ALT\" = table_variants.\"ALT\" 2722 AND table_vcf.\"REF\" = table_variants.\"REF\" 2723 
) 2724 """ 2725 self.conn.execute(sql_query_update) 2726 2727 # Drop temporary table 2728 sql_drop = f"DROP TABLE {table_vcf}" 2729 self.conn.execute(sql_drop) 2730 2731 def drop_variants_table(self) -> None: 2732 """ 2733 > This function drops the variants table 2734 """ 2735 2736 table_variants = self.get_table_variants() 2737 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2738 self.conn.execute(sql_table_variants) 2739 2740 def set_variant_id( 2741 self, variant_id_column: str = "variant_id", force: bool = None 2742 ) -> str: 2743 """ 2744 It adds a column to the variants table called `variant_id` and populates it with a hash of the 2745 `#CHROM`, `POS`, `REF`, and `ALT` columns 2746 2747 :param variant_id_column: The name of the column to be created in the variants table, defaults 2748 to variant_id 2749 :type variant_id_column: str (optional) 2750 :param force: If True, the variant_id column will be created even if it already exists 2751 :type force: bool 2752 :return: The name of the column that contains the variant_id 2753 """ 2754 2755 # Assembly 2756 assembly = self.get_param().get( 2757 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 2758 ) 2759 2760 # INFO/Tag prefix 2761 prefix = self.get_explode_infos_prefix() 2762 2763 # Explode INFO/SVTYPE 2764 added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"]) 2765 2766 # variants table 2767 table_variants = self.get_table_variants() 2768 2769 # variant_id column 2770 if not variant_id_column: 2771 variant_id_column = "variant_id" 2772 2773 # Creta variant_id column 2774 if "variant_id" not in self.get_extra_infos() or force: 2775 2776 # Create column 2777 self.add_column( 2778 table_name=table_variants, 2779 column_name=variant_id_column, 2780 column_type="UBIGINT", 2781 default_value="0", 2782 ) 2783 2784 # Update column 2785 self.conn.execute( 2786 f""" 2787 UPDATE {table_variants} 2788 SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", 
'"{prefix}SVTYPE"') 2789 """ 2790 ) 2791 2792 # Remove added columns 2793 for added_column in added_columns: 2794 self.drop_column(column=added_column) 2795 2796 # return variant_id column name 2797 return variant_id_column 2798 2799 def get_variant_id_column( 2800 self, variant_id_column: str = "variant_id", force: bool = None 2801 ) -> str: 2802 """ 2803 This function returns the variant_id column name 2804 2805 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2806 defaults to variant_id 2807 :type variant_id_column: str (optional) 2808 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2809 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2810 if it is not already set, or if it is set 2811 :type force: bool 2812 :return: The variant_id column name. 2813 """ 2814 2815 return self.set_variant_id(variant_id_column=variant_id_column, force=force) 2816 2817 ### 2818 # Annotation 2819 ### 2820 2821 def scan_databases( 2822 self, 2823 database_formats: list = ["parquet"], 2824 database_releases: list = ["current"], 2825 ) -> dict: 2826 """ 2827 The function `scan_databases` scans for available databases based on specified formats and 2828 releases. 2829 2830 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2831 of the databases to be scanned. In this case, the accepted format is "parquet" 2832 :type database_formats: list ["parquet"] 2833 :param database_releases: The `database_releases` parameter is a list that specifies the 2834 releases of the databases to be scanned. 
In the provided function, the default value for 2835 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2836 databases that are in the "current" 2837 :type database_releases: list 2838 :return: The function `scan_databases` returns a dictionary containing information about 2839 databases that match the specified formats and releases. 2840 """ 2841 2842 # Config 2843 config = self.get_config() 2844 2845 # Param 2846 param = self.get_param() 2847 2848 # Param - Assembly 2849 assembly = param.get("assembly", config.get("assembly", None)) 2850 if not assembly: 2851 assembly = DEFAULT_ASSEMBLY 2852 log.warning(f"Default assembly '{assembly}'") 2853 2854 # Scan for availabled databases 2855 log.info( 2856 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2857 ) 2858 databases_infos_dict = databases_infos( 2859 database_folder_releases=database_releases, 2860 database_formats=database_formats, 2861 assembly=assembly, 2862 config=config, 2863 ) 2864 log.info( 2865 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2866 ) 2867 2868 return databases_infos_dict 2869 2870 def annotation(self) -> None: 2871 """ 2872 It annotates the VCF file with the annotations specified in the config file. 
2873 """ 2874 2875 # Config 2876 config = self.get_config() 2877 2878 # Param 2879 param = self.get_param() 2880 2881 # Param - Assembly 2882 assembly = param.get("assembly", config.get("assembly", None)) 2883 if not assembly: 2884 assembly = DEFAULT_ASSEMBLY 2885 log.warning(f"Default assembly '{assembly}'") 2886 2887 # annotations databases folders 2888 annotations_databases = set( 2889 config.get("folders", {}) 2890 .get("databases", {}) 2891 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2892 + config.get("folders", {}) 2893 .get("databases", {}) 2894 .get("parquet", ["~/howard/databases/parquet/current"]) 2895 + config.get("folders", {}) 2896 .get("databases", {}) 2897 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2898 ) 2899 2900 # Get param annotations 2901 if param.get("annotations", None) and isinstance( 2902 param.get("annotations", None), str 2903 ): 2904 log.debug(param.get("annotations", None)) 2905 param_annotation_list = param.get("annotations").split(",") 2906 else: 2907 param_annotation_list = [] 2908 2909 # Each tools param 2910 if param.get("annotation_parquet", None) != None: 2911 log.debug( 2912 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2913 ) 2914 if isinstance(param.get("annotation_parquet", None), list): 2915 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2916 else: 2917 param_annotation_list.append(param.get("annotation_parquet")) 2918 if param.get("annotation_snpsift", None) != None: 2919 if isinstance(param.get("annotation_snpsift", None), list): 2920 param_annotation_list.append( 2921 "snpsift:" 2922 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2923 ) 2924 else: 2925 param_annotation_list.append( 2926 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2927 ) 2928 if param.get("annotation_snpeff", None) != None: 2929 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2930 if param.get("annotation_bcftools", 
None) != None: 2931 if isinstance(param.get("annotation_bcftools", None), list): 2932 param_annotation_list.append( 2933 "bcftools:" 2934 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2935 ) 2936 else: 2937 param_annotation_list.append( 2938 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2939 ) 2940 if param.get("annotation_annovar", None) != None: 2941 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2942 if param.get("annotation_exomiser", None) != None: 2943 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2944 if param.get("annotation_splice", None) != None: 2945 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2946 2947 # Merge param annotations list 2948 param["annotations"] = ",".join(param_annotation_list) 2949 2950 # debug 2951 log.debug(f"param_annotations={param['annotations']}") 2952 2953 if param.get("annotations"): 2954 2955 # Log 2956 # log.info("Annotations - Check annotation parameters") 2957 2958 if not "annotation" in param: 2959 param["annotation"] = {} 2960 2961 # List of annotations parameters 2962 annotations_list_input = {} 2963 if isinstance(param.get("annotations", None), str): 2964 annotation_file_list = [ 2965 value for value in param.get("annotations", "").split(",") 2966 ] 2967 for annotation_file in annotation_file_list: 2968 annotations_list_input[annotation_file.strip()] = {"INFO": None} 2969 else: 2970 annotations_list_input = param.get("annotations", {}) 2971 2972 log.info(f"Quick Annotations:") 2973 for annotation_key in list(annotations_list_input.keys()): 2974 log.info(f" {annotation_key}") 2975 2976 # List of annotations and associated fields 2977 annotations_list = {} 2978 2979 for annotation_file in annotations_list_input: 2980 2981 # Explode annotations if ALL 2982 if ( 2983 annotation_file.upper() == "ALL" 2984 or annotation_file.upper().startswith("ALL:") 2985 ): 2986 2987 # check ALL parameters (formats, 
releases) 2988 annotation_file_split = annotation_file.split(":") 2989 database_formats = "parquet" 2990 database_releases = "current" 2991 for annotation_file_option in annotation_file_split[1:]: 2992 database_all_options_split = annotation_file_option.split("=") 2993 if database_all_options_split[0] == "format": 2994 database_formats = database_all_options_split[1].split("+") 2995 if database_all_options_split[0] == "release": 2996 database_releases = database_all_options_split[1].split("+") 2997 2998 # Scan for availabled databases 2999 databases_infos_dict = self.scan_databases( 3000 database_formats=database_formats, 3001 database_releases=database_releases, 3002 ) 3003 3004 # Add found databases in annotation parameters 3005 for database_infos in databases_infos_dict.keys(): 3006 annotations_list[database_infos] = {"INFO": None} 3007 3008 else: 3009 annotations_list[annotation_file] = annotations_list_input[ 3010 annotation_file 3011 ] 3012 3013 # Check each databases 3014 if len(annotations_list): 3015 3016 log.info( 3017 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 
3018 ) 3019 3020 for annotation_file in annotations_list: 3021 3022 # Init 3023 annotations = annotations_list.get(annotation_file, None) 3024 3025 # Annotation snpEff 3026 if annotation_file.startswith("snpeff"): 3027 3028 log.debug(f"Quick Annotation snpEff") 3029 3030 if "snpeff" not in param["annotation"]: 3031 param["annotation"]["snpeff"] = {} 3032 3033 if "options" not in param["annotation"]["snpeff"]: 3034 param["annotation"]["snpeff"]["options"] = "" 3035 3036 # snpEff options in annotations 3037 param["annotation"]["snpeff"]["options"] = "".join( 3038 annotation_file.split(":")[1:] 3039 ) 3040 3041 # Annotation Annovar 3042 elif annotation_file.startswith("annovar"): 3043 3044 log.debug(f"Quick Annotation Annovar") 3045 3046 if "annovar" not in param["annotation"]: 3047 param["annotation"]["annovar"] = {} 3048 3049 if "annotations" not in param["annotation"]["annovar"]: 3050 param["annotation"]["annovar"]["annotations"] = {} 3051 3052 # Options 3053 annotation_file_split = annotation_file.split(":") 3054 for annotation_file_annotation in annotation_file_split[1:]: 3055 if annotation_file_annotation: 3056 param["annotation"]["annovar"]["annotations"][ 3057 annotation_file_annotation 3058 ] = annotations 3059 3060 # Annotation Exomiser 3061 elif annotation_file.startswith("exomiser"): 3062 3063 log.debug(f"Quick Annotation Exomiser") 3064 3065 param["annotation"]["exomiser"] = params_string_to_dict( 3066 annotation_file 3067 ) 3068 3069 # Annotation Splice 3070 elif annotation_file.startswith("splice"): 3071 3072 log.debug(f"Quick Annotation Splice") 3073 3074 param["annotation"]["splice"] = params_string_to_dict( 3075 annotation_file 3076 ) 3077 3078 # Annotation Parquet or BCFTOOLS 3079 else: 3080 3081 # Tools detection 3082 if annotation_file.startswith("bcftools:"): 3083 annotation_tool_initial = "bcftools" 3084 annotation_file = ":".join(annotation_file.split(":")[1:]) 3085 elif annotation_file.startswith("snpsift:"): 3086 annotation_tool_initial = 
"snpsift" 3087 annotation_file = ":".join(annotation_file.split(":")[1:]) 3088 elif annotation_file.startswith("bigwig:"): 3089 annotation_tool_initial = "bigwig" 3090 annotation_file = ":".join(annotation_file.split(":")[1:]) 3091 else: 3092 annotation_tool_initial = None 3093 3094 # list of files 3095 annotation_file_list = annotation_file.replace("+", ":").split( 3096 ":" 3097 ) 3098 3099 for annotation_file in annotation_file_list: 3100 3101 if annotation_file: 3102 3103 # Annotation tool initial 3104 annotation_tool = annotation_tool_initial 3105 3106 # Find file 3107 annotation_file_found = None 3108 3109 if os.path.exists(annotation_file): 3110 annotation_file_found = annotation_file 3111 elif os.path.exists(full_path(annotation_file)): 3112 annotation_file_found = full_path(annotation_file) 3113 else: 3114 # Find within assembly folders 3115 for annotations_database in annotations_databases: 3116 found_files = find_all( 3117 annotation_file, 3118 os.path.join( 3119 annotations_database, assembly 3120 ), 3121 ) 3122 if len(found_files) > 0: 3123 annotation_file_found = found_files[0] 3124 break 3125 if not annotation_file_found and not assembly: 3126 # Find within folders 3127 for ( 3128 annotations_database 3129 ) in annotations_databases: 3130 found_files = find_all( 3131 annotation_file, annotations_database 3132 ) 3133 if len(found_files) > 0: 3134 annotation_file_found = found_files[0] 3135 break 3136 log.debug( 3137 f"for {annotation_file} annotation_file_found={annotation_file_found}" 3138 ) 3139 3140 # Full path 3141 annotation_file_found = full_path(annotation_file_found) 3142 3143 if annotation_file_found: 3144 3145 database = Database(database=annotation_file_found) 3146 quick_annotation_format = database.get_format() 3147 quick_annotation_is_compressed = ( 3148 database.is_compressed() 3149 ) 3150 quick_annotation_is_indexed = os.path.exists( 3151 f"{annotation_file_found}.tbi" 3152 ) 3153 bcftools_preference = False 3154 3155 # Check Annotation 
Tool 3156 if not annotation_tool: 3157 if ( 3158 bcftools_preference 3159 and quick_annotation_format 3160 in ["vcf", "bed"] 3161 and quick_annotation_is_compressed 3162 and quick_annotation_is_indexed 3163 ): 3164 annotation_tool = "bcftools" 3165 elif quick_annotation_format in [ 3166 "vcf", 3167 "bed", 3168 "tsv", 3169 "tsv", 3170 "csv", 3171 "json", 3172 "tbl", 3173 "parquet", 3174 "duckdb", 3175 ]: 3176 annotation_tool = "parquet" 3177 elif quick_annotation_format in ["bw"]: 3178 annotation_tool = "bigwig" 3179 else: 3180 log.error( 3181 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3182 ) 3183 raise ValueError( 3184 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3185 ) 3186 3187 log.debug( 3188 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3189 ) 3190 3191 # Annotation Tool dispatch 3192 if annotation_tool: 3193 if annotation_tool not in param["annotation"]: 3194 param["annotation"][annotation_tool] = {} 3195 if ( 3196 "annotations" 3197 not in param["annotation"][annotation_tool] 3198 ): 3199 param["annotation"][annotation_tool][ 3200 "annotations" 3201 ] = {} 3202 param["annotation"][annotation_tool][ 3203 "annotations" 3204 ][annotation_file_found] = annotations 3205 3206 else: 3207 log.warning( 3208 f"Quick Annotation File {annotation_file} does NOT exist" 3209 ) 3210 3211 self.set_param(param) 3212 3213 if param.get("annotation", None): 3214 log.info("Annotations") 3215 if param.get("annotation", {}).get("parquet", None): 3216 log.info("Annotations 'parquet'...") 3217 self.annotation_parquet() 3218 if param.get("annotation", {}).get("bcftools", None): 3219 log.info("Annotations 'bcftools'...") 3220 self.annotation_bcftools() 3221 if param.get("annotation", {}).get("snpsift", None): 3222 log.info("Annotations 'snpsift'...") 3223 self.annotation_snpsift() 3224 if param.get("annotation", {}).get("bigwig", None): 
3225 log.info("Annotations 'bigwig'...") 3226 self.annotation_bigwig() 3227 if param.get("annotation", {}).get("annovar", None): 3228 log.info("Annotations 'annovar'...") 3229 self.annotation_annovar() 3230 if param.get("annotation", {}).get("snpeff", None): 3231 log.info("Annotations 'snpeff'...") 3232 self.annotation_snpeff() 3233 if param.get("annotation", {}).get("exomiser", None) is not None: 3234 log.info("Annotations 'exomiser'...") 3235 self.annotation_exomiser() 3236 if param.get("annotation", {}).get("splice", None) is not None: 3237 log.info("Annotations 'splice' ...") 3238 self.annotation_splice() 3239 3240 # Explode INFOS fields into table fields 3241 if self.get_explode_infos(): 3242 self.explode_infos( 3243 prefix=self.get_explode_infos_prefix(), 3244 fields=self.get_explode_infos_fields(), 3245 force=True, 3246 ) 3247 3248 def annotation_bigwig(self, threads: int = None) -> None: 3249 """ 3250 The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases. 3251 3252 :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the 3253 number of threads to be used for parallel processing during the annotation process. 
If the 3254 `threads` parameter is not provided, the method will attempt to determine the optimal number of 3255 threads to use based on the system configuration 3256 :type threads: int 3257 :return: True 3258 """ 3259 3260 # DEBUG 3261 log.debug("Start annotation with bigwig databases") 3262 3263 # # Threads 3264 # if not threads: 3265 # threads = self.get_threads() 3266 # log.debug("Threads: " + str(threads)) 3267 3268 # Config 3269 config = self.get_config() 3270 log.debug("Config: " + str(config)) 3271 3272 # Config - BCFTools databases folders 3273 databases_folders = set( 3274 self.get_config() 3275 .get("folders", {}) 3276 .get("databases", {}) 3277 .get("annotations", ["."]) 3278 + self.get_config() 3279 .get("folders", {}) 3280 .get("databases", {}) 3281 .get("bigwig", ["."]) 3282 ) 3283 log.debug("Databases annotations: " + str(databases_folders)) 3284 3285 # Param 3286 annotations = ( 3287 self.get_param() 3288 .get("annotation", {}) 3289 .get("bigwig", {}) 3290 .get("annotations", None) 3291 ) 3292 log.debug("Annotations: " + str(annotations)) 3293 3294 # Assembly 3295 assembly = self.get_param().get( 3296 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3297 ) 3298 3299 # Data 3300 table_variants = self.get_table_variants() 3301 3302 # Check if not empty 3303 log.debug("Check if not empty") 3304 sql_query_chromosomes = ( 3305 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3306 ) 3307 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3308 if not sql_query_chromosomes_df["count"][0]: 3309 log.info(f"VCF empty") 3310 return 3311 3312 # VCF header 3313 vcf_reader = self.get_header() 3314 log.debug("Initial header: " + str(vcf_reader.infos)) 3315 3316 # Existing annotations 3317 for vcf_annotation in self.get_header().infos: 3318 3319 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3320 log.debug( 3321 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3322 
) 3323 3324 if annotations: 3325 3326 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3327 3328 # Export VCF file 3329 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3330 3331 # annotation_bigwig_config 3332 annotation_bigwig_config_list = [] 3333 3334 for annotation in annotations: 3335 annotation_fields = annotations[annotation] 3336 3337 # Annotation Name 3338 annotation_name = os.path.basename(annotation) 3339 3340 if not annotation_fields: 3341 annotation_fields = {"INFO": None} 3342 3343 log.debug(f"Annotation '{annotation_name}'") 3344 log.debug( 3345 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3346 ) 3347 3348 # Create Database 3349 database = Database( 3350 database=annotation, 3351 databases_folders=databases_folders, 3352 assembly=assembly, 3353 ) 3354 3355 # Find files 3356 db_file = database.get_database() 3357 db_file = full_path(db_file) 3358 db_hdr_file = database.get_header_file() 3359 db_hdr_file = full_path(db_hdr_file) 3360 db_file_type = database.get_format() 3361 3362 # If db_file is http ? 
3363 if database.get_database().startswith("http"): 3364 3365 # Datbase is HTTP URL 3366 db_file_is_http = True 3367 3368 # DB file keep as URL 3369 db_file = database.get_database() 3370 log.warning( 3371 f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)" 3372 ) 3373 3374 # Retrieve automatic annotation field name 3375 annotation_field = clean_annotation_field( 3376 os.path.basename(db_file).replace(".bw", "") 3377 ) 3378 log.debug( 3379 f"Create header file with annotation field '{annotation_field}' is an HTTP URL" 3380 ) 3381 3382 # Create automatic header file 3383 db_hdr_file = os.path.join(tmp_dir, "header.hdr") 3384 with open(db_hdr_file, "w") as f: 3385 f.write("##fileformat=VCFv4.2\n") 3386 f.write( 3387 f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n""" 3388 ) 3389 f.write(f"#CHROM START END {annotation_field}\n") 3390 3391 else: 3392 3393 # Datbase is NOT HTTP URL 3394 db_file_is_http = False 3395 3396 # Check index - try to create if not exists 3397 if ( 3398 db_file is None 3399 or db_hdr_file is None 3400 or (not os.path.exists(db_file) and not db_file_is_http) 3401 or not os.path.exists(db_hdr_file) 3402 or not db_file_type in ["bw"] 3403 ): 3404 # if False: 3405 log.error("Annotation failed: database not valid") 3406 log.error(f"Annotation annotation file: {db_file}") 3407 log.error(f"Annotation annotation file type: {db_file_type}") 3408 log.error(f"Annotation annotation header: {db_hdr_file}") 3409 raise ValueError( 3410 f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}" 3411 ) 3412 else: 3413 3414 # Log 3415 log.debug( 3416 f"Annotation '{annotation}' - file: " 3417 + str(db_file) 3418 + " and " 3419 + str(db_hdr_file) 3420 ) 3421 3422 # Load header as VCF object 3423 db_hdr_vcf = Variants(input=db_hdr_file) 3424 db_hdr_vcf_header_infos = 
db_hdr_vcf.get_header().infos 3425 log.debug( 3426 "Annotation database header: " 3427 + str(db_hdr_vcf_header_infos) 3428 ) 3429 3430 # For all fields in database 3431 annotation_fields_full = False 3432 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3433 annotation_fields = { 3434 key: key for key in db_hdr_vcf_header_infos 3435 } 3436 log.debug( 3437 "Annotation database header - All annotations added: " 3438 + str(annotation_fields) 3439 ) 3440 annotation_fields_full = True 3441 3442 # Init 3443 cyvcf2_header_rename_dict = {} 3444 cyvcf2_header_list = [] 3445 cyvcf2_header_indexes = {} 3446 3447 # process annotation fields 3448 for annotation_field in annotation_fields: 3449 3450 # New annotation name 3451 annotation_field_new = annotation_fields[annotation_field] 3452 3453 # Check annotation field and index in header 3454 if ( 3455 annotation_field 3456 in db_hdr_vcf.get_header_columns_as_list() 3457 ): 3458 annotation_field_index = ( 3459 db_hdr_vcf.get_header_columns_as_list().index( 3460 annotation_field 3461 ) 3462 - 3 3463 ) 3464 cyvcf2_header_indexes[annotation_field_new] = ( 3465 annotation_field_index 3466 ) 3467 else: 3468 msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'" 3469 log.error(msg_err) 3470 raise ValueError(msg_err) 3471 3472 # Append annotation field in cyvcf2 header list 3473 cyvcf2_header_rename_dict[annotation_field_new] = ( 3474 db_hdr_vcf_header_infos[annotation_field].id 3475 ) 3476 cyvcf2_header_list.append( 3477 { 3478 "ID": annotation_field_new, 3479 "Number": db_hdr_vcf_header_infos[ 3480 annotation_field 3481 ].num, 3482 "Type": db_hdr_vcf_header_infos[ 3483 annotation_field 3484 ].type, 3485 "Description": db_hdr_vcf_header_infos[ 3486 annotation_field 3487 ].desc, 3488 } 3489 ) 3490 3491 # Add header on VCF 3492 vcf_reader.infos[annotation_field_new] = vcf.parser._Info( 3493 annotation_field_new, 3494 db_hdr_vcf_header_infos[annotation_field].num, 3495 
db_hdr_vcf_header_infos[annotation_field].type, 3496 db_hdr_vcf_header_infos[annotation_field].desc, 3497 "HOWARD BigWig annotation", 3498 "unknown", 3499 self.code_type_map[ 3500 db_hdr_vcf_header_infos[annotation_field].type 3501 ], 3502 ) 3503 3504 # Load bigwig database 3505 bw_db = pyBigWig.open(db_file) 3506 if bw_db.isBigWig(): 3507 log.debug(f"Database '{db_file}' is in 'BigWig' format") 3508 else: 3509 msg_err = f"Database '{db_file}' is NOT in 'BigWig' format" 3510 log.error(msg_err) 3511 raise ValueError(msg_err) 3512 3513 annotation_bigwig_config_list.append( 3514 { 3515 "db_file": db_file, 3516 "bw_db": bw_db, 3517 "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict, 3518 "cyvcf2_header_list": cyvcf2_header_list, 3519 "cyvcf2_header_indexes": cyvcf2_header_indexes, 3520 } 3521 ) 3522 3523 # Annotate 3524 if annotation_bigwig_config_list: 3525 3526 # Annotation config 3527 log.debug( 3528 f"annotation_bigwig_config={annotation_bigwig_config_list}" 3529 ) 3530 3531 # Export VCF file 3532 self.export_variant_vcf( 3533 vcf_file=tmp_vcf_name, 3534 remove_info=True, 3535 add_samples=False, 3536 index=True, 3537 ) 3538 3539 # Load input tmp file 3540 input_vcf = cyvcf2.VCF(tmp_vcf_name) 3541 3542 # Add header in input file 3543 for annotation_bigwig_config in annotation_bigwig_config_list: 3544 for cyvcf2_header_field in annotation_bigwig_config.get( 3545 "cyvcf2_header_list", [] 3546 ): 3547 log.info( 3548 f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'" 3549 ) 3550 input_vcf.add_info_to_header(cyvcf2_header_field) 3551 3552 # Create output VCF file 3553 output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz") 3554 output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf) 3555 3556 # Fetch variants 3557 log.info(f"Annotations 'bigwig' start...") 3558 
for variant in input_vcf: 3559 3560 for annotation_bigwig_config in annotation_bigwig_config_list: 3561 3562 # DB and indexes 3563 bw_db = annotation_bigwig_config.get("bw_db", None) 3564 cyvcf2_header_indexes = annotation_bigwig_config.get( 3565 "cyvcf2_header_indexes", None 3566 ) 3567 3568 # Retrieve value from chrom pos 3569 res = bw_db.values( 3570 variant.CHROM, variant.POS - 1, variant.POS 3571 ) 3572 3573 # For each annotation fields (and indexes) 3574 for cyvcf2_header_index in cyvcf2_header_indexes: 3575 3576 # If value is NOT nNone 3577 if not np.isnan( 3578 res[cyvcf2_header_indexes[cyvcf2_header_index]] 3579 ): 3580 variant.INFO[cyvcf2_header_index] = res[ 3581 cyvcf2_header_indexes[cyvcf2_header_index] 3582 ] 3583 3584 # Add record in output file 3585 output_vcf.write_record(variant) 3586 3587 # Log 3588 log.debug(f"Annotation done.") 3589 3590 # Close and write file 3591 log.info(f"Annotations 'bigwig' write...") 3592 output_vcf.close() 3593 log.debug(f"Write done.") 3594 3595 # Update variants 3596 log.info(f"Annotations 'bigwig' update...") 3597 self.update_from_vcf(output_vcf_file) 3598 log.debug(f"Update done.") 3599 3600 return True 3601 3602 def annotation_snpsift(self, threads: int = None) -> None: 3603 """ 3604 This function annotate with bcftools 3605 3606 :param threads: Number of threads to use 3607 :return: the value of the variable "return_value". 
3608 """ 3609 3610 # DEBUG 3611 log.debug("Start annotation with bcftools databases") 3612 3613 # Threads 3614 if not threads: 3615 threads = self.get_threads() 3616 log.debug("Threads: " + str(threads)) 3617 3618 # Config 3619 config = self.get_config() 3620 log.debug("Config: " + str(config)) 3621 3622 # Config - snpSift 3623 snpsift_bin_command = get_bin_command( 3624 bin="SnpSift.jar", 3625 tool="snpsift", 3626 bin_type="jar", 3627 config=config, 3628 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3629 ) 3630 if not snpsift_bin_command: 3631 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3632 log.error(msg_err) 3633 raise ValueError(msg_err) 3634 3635 # Config - bcftools 3636 bcftools_bin_command = get_bin_command( 3637 bin="bcftools", 3638 tool="bcftools", 3639 bin_type="bin", 3640 config=config, 3641 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3642 ) 3643 if not bcftools_bin_command: 3644 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3645 log.error(msg_err) 3646 raise ValueError(msg_err) 3647 3648 # Config - BCFTools databases folders 3649 databases_folders = set( 3650 self.get_config() 3651 .get("folders", {}) 3652 .get("databases", {}) 3653 .get("annotations", ["."]) 3654 + self.get_config() 3655 .get("folders", {}) 3656 .get("databases", {}) 3657 .get("bcftools", ["."]) 3658 ) 3659 log.debug("Databases annotations: " + str(databases_folders)) 3660 3661 # Param 3662 annotations = ( 3663 self.get_param() 3664 .get("annotation", {}) 3665 .get("snpsift", {}) 3666 .get("annotations", None) 3667 ) 3668 log.debug("Annotations: " + str(annotations)) 3669 3670 # Assembly 3671 assembly = self.get_param().get( 3672 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3673 ) 3674 3675 # Data 3676 table_variants = self.get_table_variants() 3677 3678 # Check if not empty 3679 log.debug("Check if not empty") 3680 sql_query_chromosomes = ( 3681 f"""SELECT count(*) as count FROM {table_variants} as 
table_variants""" 3682 ) 3683 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3684 if not sql_query_chromosomes_df["count"][0]: 3685 log.info(f"VCF empty") 3686 return 3687 3688 # VCF header 3689 vcf_reader = self.get_header() 3690 log.debug("Initial header: " + str(vcf_reader.infos)) 3691 3692 # Existing annotations 3693 for vcf_annotation in self.get_header().infos: 3694 3695 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3696 log.debug( 3697 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3698 ) 3699 3700 if annotations: 3701 3702 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3703 3704 # Export VCF file 3705 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3706 3707 # Init 3708 commands = {} 3709 3710 for annotation in annotations: 3711 annotation_fields = annotations[annotation] 3712 3713 # Annotation Name 3714 annotation_name = os.path.basename(annotation) 3715 3716 if not annotation_fields: 3717 annotation_fields = {"INFO": None} 3718 3719 log.debug(f"Annotation '{annotation_name}'") 3720 log.debug( 3721 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3722 ) 3723 3724 # Create Database 3725 database = Database( 3726 database=annotation, 3727 databases_folders=databases_folders, 3728 assembly=assembly, 3729 ) 3730 3731 # Find files 3732 db_file = database.get_database() 3733 db_file = full_path(db_file) 3734 db_hdr_file = database.get_header_file() 3735 db_hdr_file = full_path(db_hdr_file) 3736 db_file_type = database.get_format() 3737 db_tbi_file = f"{db_file}.tbi" 3738 db_file_compressed = database.is_compressed() 3739 3740 # Check if compressed 3741 if not db_file_compressed: 3742 log.error( 3743 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3744 ) 3745 raise ValueError( 3746 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3747 ) 3748 3749 # Check if indexed 3750 if not os.path.exists(db_tbi_file): 3751 log.error( 3752 
f"Annotation '{annotation}' - {db_file} NOT indexed file" 3753 ) 3754 raise ValueError( 3755 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3756 ) 3757 3758 # Check index - try to create if not exists 3759 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3760 log.error("Annotation failed: database not valid") 3761 log.error(f"Annotation annotation file: {db_file}") 3762 log.error(f"Annotation annotation header: {db_hdr_file}") 3763 log.error(f"Annotation annotation index: {db_tbi_file}") 3764 raise ValueError( 3765 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3766 ) 3767 else: 3768 3769 log.debug( 3770 f"Annotation '{annotation}' - file: " 3771 + str(db_file) 3772 + " and " 3773 + str(db_hdr_file) 3774 ) 3775 3776 # Load header as VCF object 3777 db_hdr_vcf = Variants(input=db_hdr_file) 3778 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3779 log.debug( 3780 "Annotation database header: " 3781 + str(db_hdr_vcf_header_infos) 3782 ) 3783 3784 # For all fields in database 3785 annotation_fields_full = False 3786 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3787 annotation_fields = { 3788 key: key for key in db_hdr_vcf_header_infos 3789 } 3790 log.debug( 3791 "Annotation database header - All annotations added: " 3792 + str(annotation_fields) 3793 ) 3794 annotation_fields_full = True 3795 3796 # # Create file for field rename 3797 # log.debug("Create file for field rename") 3798 # tmp_rename = NamedTemporaryFile( 3799 # prefix=self.get_prefix(), 3800 # dir=self.get_tmp_dir(), 3801 # suffix=".rename", 3802 # delete=False, 3803 # ) 3804 # tmp_rename_name = tmp_rename.name 3805 # tmp_files.append(tmp_rename_name) 3806 3807 # Number of fields 3808 nb_annotation_field = 0 3809 annotation_list = [] 3810 annotation_infos_rename_list = [] 3811 3812 for annotation_field in 
annotation_fields: 3813 3814 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3815 annotation_fields_new_name = annotation_fields.get( 3816 annotation_field, annotation_field 3817 ) 3818 if not annotation_fields_new_name: 3819 annotation_fields_new_name = annotation_field 3820 3821 # Check if field is in DB and if field is not elready in input data 3822 if ( 3823 annotation_field in db_hdr_vcf.get_header().infos 3824 and annotation_fields_new_name 3825 not in self.get_header().infos 3826 ): 3827 3828 log.info( 3829 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3830 ) 3831 3832 # BCFTools annotate param to rename fields 3833 if annotation_field != annotation_fields_new_name: 3834 annotation_infos_rename_list.append( 3835 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3836 ) 3837 3838 # Add INFO field to header 3839 db_hdr_vcf_header_infos_number = ( 3840 db_hdr_vcf_header_infos[annotation_field].num or "." 3841 ) 3842 db_hdr_vcf_header_infos_type = ( 3843 db_hdr_vcf_header_infos[annotation_field].type 3844 or "String" 3845 ) 3846 db_hdr_vcf_header_infos_description = ( 3847 db_hdr_vcf_header_infos[annotation_field].desc 3848 or f"{annotation_field} description" 3849 ) 3850 db_hdr_vcf_header_infos_source = ( 3851 db_hdr_vcf_header_infos[annotation_field].source 3852 or "unknown" 3853 ) 3854 db_hdr_vcf_header_infos_version = ( 3855 db_hdr_vcf_header_infos[annotation_field].version 3856 or "unknown" 3857 ) 3858 3859 vcf_reader.infos[annotation_fields_new_name] = ( 3860 vcf.parser._Info( 3861 annotation_fields_new_name, 3862 db_hdr_vcf_header_infos_number, 3863 db_hdr_vcf_header_infos_type, 3864 db_hdr_vcf_header_infos_description, 3865 db_hdr_vcf_header_infos_source, 3866 db_hdr_vcf_header_infos_version, 3867 self.code_type_map[ 3868 db_hdr_vcf_header_infos_type 3869 ], 3870 ) 3871 ) 3872 3873 annotation_list.append(annotation_field) 3874 3875 nb_annotation_field += 1 3876 3877 else: 3878 
3879 if ( 3880 annotation_field 3881 not in db_hdr_vcf.get_header().infos 3882 ): 3883 log.warning( 3884 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3885 ) 3886 if ( 3887 annotation_fields_new_name 3888 in self.get_header().infos 3889 ): 3890 log.warning( 3891 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3892 ) 3893 3894 log.info( 3895 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3896 ) 3897 3898 annotation_infos = ",".join(annotation_list) 3899 3900 if annotation_infos != "": 3901 3902 # Annotated VCF (and error file) 3903 tmp_annotation_vcf_name = os.path.join( 3904 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3905 ) 3906 tmp_annotation_vcf_name_err = ( 3907 tmp_annotation_vcf_name + ".err" 3908 ) 3909 3910 # Add fields to annotate 3911 if not annotation_fields_full: 3912 annotation_infos_option = f"-info {annotation_infos}" 3913 else: 3914 annotation_infos_option = "" 3915 3916 # Info fields rename 3917 if annotation_infos_rename_list: 3918 annotation_infos_rename = " -c " + ",".join( 3919 annotation_infos_rename_list 3920 ) 3921 else: 3922 annotation_infos_rename = "" 3923 3924 # Annotate command 3925 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3926 3927 # Add command 3928 commands[command_annotate] = tmp_annotation_vcf_name 3929 3930 if commands: 3931 3932 # Export VCF file 3933 self.export_variant_vcf( 3934 vcf_file=tmp_vcf_name, 3935 remove_info=True, 3936 add_samples=False, 3937 index=True, 3938 ) 3939 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3940 3941 # Num command 3942 nb_command = 0 3943 3944 # Annotate 3945 for command_annotate in commands: 3946 nb_command += 1 3947 log.info( 3948 f"Annotation - 
Annotate [{nb_command}/{len(commands)}]..." 3949 ) 3950 log.debug(f"command_annotate={command_annotate}") 3951 run_parallel_commands([command_annotate], threads) 3952 3953 # Debug 3954 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3955 3956 # Update variants 3957 log.info( 3958 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3959 ) 3960 self.update_from_vcf(commands[command_annotate]) 3961 3962 def annotation_bcftools(self, threads: int = None) -> None: 3963 """ 3964 This function annotate with bcftools 3965 3966 :param threads: Number of threads to use 3967 :return: the value of the variable "return_value". 3968 """ 3969 3970 # DEBUG 3971 log.debug("Start annotation with bcftools databases") 3972 3973 # Threads 3974 if not threads: 3975 threads = self.get_threads() 3976 log.debug("Threads: " + str(threads)) 3977 3978 # Config 3979 config = self.get_config() 3980 log.debug("Config: " + str(config)) 3981 3982 # DEBUG 3983 delete_tmp = True 3984 if self.get_config().get("verbosity", "warning") in ["debug"]: 3985 delete_tmp = False 3986 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 3987 3988 # Config - BCFTools bin command 3989 bcftools_bin_command = get_bin_command( 3990 bin="bcftools", 3991 tool="bcftools", 3992 bin_type="bin", 3993 config=config, 3994 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3995 ) 3996 if not bcftools_bin_command: 3997 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3998 log.error(msg_err) 3999 raise ValueError(msg_err) 4000 4001 # Config - BCFTools databases folders 4002 databases_folders = set( 4003 self.get_config() 4004 .get("folders", {}) 4005 .get("databases", {}) 4006 .get("annotations", ["."]) 4007 + self.get_config() 4008 .get("folders", {}) 4009 .get("databases", {}) 4010 .get("bcftools", ["."]) 4011 ) 4012 log.debug("Databases annotations: " + str(databases_folders)) 4013 4014 # Param 4015 annotations = ( 4016 self.get_param() 4017 .get("annotation", {}) 4018 
.get("bcftools", {}) 4019 .get("annotations", None) 4020 ) 4021 log.debug("Annotations: " + str(annotations)) 4022 4023 # Assembly 4024 assembly = self.get_param().get( 4025 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 4026 ) 4027 4028 # Data 4029 table_variants = self.get_table_variants() 4030 4031 # Check if not empty 4032 log.debug("Check if not empty") 4033 sql_query_chromosomes = ( 4034 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4035 ) 4036 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 4037 if not sql_query_chromosomes_df["count"][0]: 4038 log.info(f"VCF empty") 4039 return 4040 4041 # Export in VCF 4042 log.debug("Create initial file to annotate") 4043 tmp_vcf = NamedTemporaryFile( 4044 prefix=self.get_prefix(), 4045 dir=self.get_tmp_dir(), 4046 suffix=".vcf.gz", 4047 delete=False, 4048 ) 4049 tmp_vcf_name = tmp_vcf.name 4050 4051 # VCF header 4052 vcf_reader = self.get_header() 4053 log.debug("Initial header: " + str(vcf_reader.infos)) 4054 4055 # Existing annotations 4056 for vcf_annotation in self.get_header().infos: 4057 4058 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4059 log.debug( 4060 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4061 ) 4062 4063 if annotations: 4064 4065 tmp_ann_vcf_list = [] 4066 commands = [] 4067 tmp_files = [] 4068 err_files = [] 4069 4070 for annotation in annotations: 4071 annotation_fields = annotations[annotation] 4072 4073 # Annotation Name 4074 annotation_name = os.path.basename(annotation) 4075 4076 if not annotation_fields: 4077 annotation_fields = {"INFO": None} 4078 4079 log.debug(f"Annotation '{annotation_name}'") 4080 log.debug( 4081 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 4082 ) 4083 4084 # Create Database 4085 database = Database( 4086 database=annotation, 4087 databases_folders=databases_folders, 4088 assembly=assembly, 4089 ) 4090 4091 # Find files 4092 db_file = 
database.get_database() 4093 db_file = full_path(db_file) 4094 db_hdr_file = database.get_header_file() 4095 db_hdr_file = full_path(db_hdr_file) 4096 db_file_type = database.get_format() 4097 db_tbi_file = f"{db_file}.tbi" 4098 db_file_compressed = database.is_compressed() 4099 4100 # Check if compressed 4101 if not db_file_compressed: 4102 log.error( 4103 f"Annotation '{annotation}' - {db_file} NOT compressed file" 4104 ) 4105 raise ValueError( 4106 f"Annotation '{annotation}' - {db_file} NOT compressed file" 4107 ) 4108 4109 # Check if indexed 4110 if not os.path.exists(db_tbi_file): 4111 log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file") 4112 raise ValueError( 4113 f"Annotation '{annotation}' - {db_file} NOT indexed file" 4114 ) 4115 4116 # Check index - try to create if not exists 4117 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 4118 log.error("Annotation failed: database not valid") 4119 log.error(f"Annotation annotation file: {db_file}") 4120 log.error(f"Annotation annotation header: {db_hdr_file}") 4121 log.error(f"Annotation annotation index: {db_tbi_file}") 4122 raise ValueError( 4123 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 4124 ) 4125 else: 4126 4127 log.debug( 4128 f"Annotation '{annotation}' - file: " 4129 + str(db_file) 4130 + " and " 4131 + str(db_hdr_file) 4132 ) 4133 4134 # Load header as VCF object 4135 db_hdr_vcf = Variants(input=db_hdr_file) 4136 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 4137 log.debug( 4138 "Annotation database header: " + str(db_hdr_vcf_header_infos) 4139 ) 4140 4141 # For all fields in database 4142 if "ALL" in annotation_fields or "INFO" in annotation_fields: 4143 annotation_fields = { 4144 key: key for key in db_hdr_vcf_header_infos 4145 } 4146 log.debug( 4147 "Annotation database header - All annotations added: " 4148 + 
str(annotation_fields) 4149 ) 4150 4151 # Number of fields 4152 nb_annotation_field = 0 4153 annotation_list = [] 4154 4155 for annotation_field in annotation_fields: 4156 4157 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 4158 annotation_fields_new_name = annotation_fields.get( 4159 annotation_field, annotation_field 4160 ) 4161 if not annotation_fields_new_name: 4162 annotation_fields_new_name = annotation_field 4163 4164 # Check if field is in DB and if field is not elready in input data 4165 if ( 4166 annotation_field in db_hdr_vcf.get_header().infos 4167 and annotation_fields_new_name 4168 not in self.get_header().infos 4169 ): 4170 4171 log.info( 4172 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 4173 ) 4174 4175 # Add INFO field to header 4176 db_hdr_vcf_header_infos_number = ( 4177 db_hdr_vcf_header_infos[annotation_field].num or "." 4178 ) 4179 db_hdr_vcf_header_infos_type = ( 4180 db_hdr_vcf_header_infos[annotation_field].type 4181 or "String" 4182 ) 4183 db_hdr_vcf_header_infos_description = ( 4184 db_hdr_vcf_header_infos[annotation_field].desc 4185 or f"{annotation_field} description" 4186 ) 4187 db_hdr_vcf_header_infos_source = ( 4188 db_hdr_vcf_header_infos[annotation_field].source 4189 or "unknown" 4190 ) 4191 db_hdr_vcf_header_infos_version = ( 4192 db_hdr_vcf_header_infos[annotation_field].version 4193 or "unknown" 4194 ) 4195 4196 vcf_reader.infos[annotation_fields_new_name] = ( 4197 vcf.parser._Info( 4198 annotation_fields_new_name, 4199 db_hdr_vcf_header_infos_number, 4200 db_hdr_vcf_header_infos_type, 4201 db_hdr_vcf_header_infos_description, 4202 db_hdr_vcf_header_infos_source, 4203 db_hdr_vcf_header_infos_version, 4204 self.code_type_map[db_hdr_vcf_header_infos_type], 4205 ) 4206 ) 4207 4208 # annotation_list.append(annotation_field) 4209 if annotation_field != annotation_fields_new_name: 4210 annotation_list.append( 4211 
f"{annotation_fields_new_name}:=INFO/{annotation_field}" 4212 ) 4213 else: 4214 annotation_list.append(annotation_field) 4215 4216 nb_annotation_field += 1 4217 4218 else: 4219 4220 if annotation_field not in db_hdr_vcf.get_header().infos: 4221 log.warning( 4222 f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file" 4223 ) 4224 if annotation_fields_new_name in self.get_header().infos: 4225 log.warning( 4226 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 4227 ) 4228 4229 log.info( 4230 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 4231 ) 4232 4233 annotation_infos = ",".join(annotation_list) 4234 4235 if annotation_infos != "": 4236 4237 # Protect header for bcftools (remove "#CHROM" and variants line) 4238 log.debug("Protect Header file - remove #CHROM line if exists") 4239 tmp_header_vcf = NamedTemporaryFile( 4240 prefix=self.get_prefix(), 4241 dir=self.get_tmp_dir(), 4242 suffix=".hdr", 4243 delete=False, 4244 ) 4245 tmp_header_vcf_name = tmp_header_vcf.name 4246 tmp_files.append(tmp_header_vcf_name) 4247 # Command 4248 if db_hdr_file.endswith(".gz"): 4249 command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 4250 else: 4251 command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 4252 # Run 4253 run_parallel_commands([command_extract_header], 1) 4254 4255 # Find chomosomes 4256 log.debug("Find chromosomes ") 4257 sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\"""" 4258 sql_query_chromosomes_df = self.get_query_to_df( 4259 sql_query_chromosomes 4260 ) 4261 chomosomes_list = list(sql_query_chromosomes_df["CHROM"]) 4262 4263 log.debug("Chromosomes found: " + str(list(chomosomes_list))) 4264 4265 # BED columns in the annotation file 4266 if db_file_type in ["bed"]: 4267 annotation_infos = 
"CHROM,POS,POS," + annotation_infos 4268 4269 for chrom in chomosomes_list: 4270 4271 # Create BED on initial VCF 4272 log.debug("Create BED on initial VCF: " + str(tmp_vcf_name)) 4273 tmp_bed = NamedTemporaryFile( 4274 prefix=self.get_prefix(), 4275 dir=self.get_tmp_dir(), 4276 suffix=".bed", 4277 delete=False, 4278 ) 4279 tmp_bed_name = tmp_bed.name 4280 tmp_files.append(tmp_bed_name) 4281 4282 # Detecte regions 4283 log.debug( 4284 f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..." 4285 ) 4286 window = 1000000 4287 sql_query_intervals_for_bed = f""" 4288 SELECT \"#CHROM\", 4289 CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END, 4290 \"POS\"+{window} 4291 FROM {table_variants} as table_variants 4292 WHERE table_variants.\"#CHROM\" = '{chrom}' 4293 """ 4294 regions = self.conn.execute( 4295 sql_query_intervals_for_bed 4296 ).fetchall() 4297 merged_regions = merge_regions(regions) 4298 log.debug( 4299 f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..." 
4300 ) 4301 4302 header = ["#CHROM", "START", "END"] 4303 with open(tmp_bed_name, "w") as f: 4304 # Write the header with tab delimiter 4305 f.write("\t".join(header) + "\n") 4306 for d in merged_regions: 4307 # Write each data row with tab delimiter 4308 f.write("\t".join(map(str, d)) + "\n") 4309 4310 # Tmp files 4311 tmp_annotation_vcf = NamedTemporaryFile( 4312 prefix=self.get_prefix(), 4313 dir=self.get_tmp_dir(), 4314 suffix=".vcf.gz", 4315 delete=False, 4316 ) 4317 tmp_annotation_vcf_name = tmp_annotation_vcf.name 4318 tmp_files.append(tmp_annotation_vcf_name) 4319 tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}") 4320 tmp_annotation_vcf_name_err = ( 4321 tmp_annotation_vcf_name + ".err" 4322 ) 4323 err_files.append(tmp_annotation_vcf_name_err) 4324 4325 # Annotate Command 4326 log.debug( 4327 f"Annotation '{annotation}' - add bcftools command" 4328 ) 4329 4330 # Command 4331 command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 4332 4333 # Add command 4334 commands.append(command_annotate) 4335 4336 # if some commands 4337 if commands: 4338 4339 # Export VCF file 4340 self.export_variant_vcf( 4341 vcf_file=tmp_vcf_name, 4342 remove_info=True, 4343 add_samples=False, 4344 index=True, 4345 ) 4346 4347 # Threads 4348 # calculate threads for annotated commands 4349 if commands: 4350 threads_bcftools_annotate = round(threads / len(commands)) 4351 else: 4352 threads_bcftools_annotate = 1 4353 4354 if not threads_bcftools_annotate: 4355 threads_bcftools_annotate = 1 4356 4357 # Add threads option to bcftools commands 4358 if threads_bcftools_annotate > 1: 4359 commands_threaded = [] 4360 for command in commands: 4361 commands_threaded.append( 4362 command.replace( 4363 f"{bcftools_bin_command} annotate ", 
4364 f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ", 4365 ) 4366 ) 4367 commands = commands_threaded 4368 4369 # Command annotation multithreading 4370 log.debug(f"Annotation - Annotation commands: " + str(commands)) 4371 log.info( 4372 f"Annotation - Annotation multithreaded in " 4373 + str(len(commands)) 4374 + " commands" 4375 ) 4376 4377 run_parallel_commands(commands, threads) 4378 4379 # Merge 4380 tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list) 4381 4382 if tmp_ann_vcf_list_cmd: 4383 4384 # Tmp file 4385 tmp_annotate_vcf = NamedTemporaryFile( 4386 prefix=self.get_prefix(), 4387 dir=self.get_tmp_dir(), 4388 suffix=".vcf.gz", 4389 delete=True, 4390 ) 4391 tmp_annotate_vcf_name = tmp_annotate_vcf.name 4392 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 4393 err_files.append(tmp_annotate_vcf_name_err) 4394 4395 # Tmp file remove command 4396 tmp_files_remove_command = "" 4397 if tmp_files: 4398 tmp_files_remove_command = " && rm -f " + " ".join(tmp_files) 4399 4400 # Command merge 4401 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}" 4402 log.info( 4403 f"Annotation - Annotation merging " 4404 + str(len(commands)) 4405 + " annotated files" 4406 ) 4407 log.debug(f"Annotation - merge command: {merge_command}") 4408 run_parallel_commands([merge_command], 1) 4409 4410 # Error messages 4411 log.info(f"Error/Warning messages:") 4412 error_message_command_all = [] 4413 error_message_command_warning = [] 4414 error_message_command_err = [] 4415 for err_file in err_files: 4416 with open(err_file, "r") as f: 4417 for line in f: 4418 message = line.strip() 4419 error_message_command_all.append(message) 4420 if line.startswith("[W::"): 4421 error_message_command_warning.append(message) 4422 if line.startswith("[E::"): 4423 error_message_command_err.append( 4424 f"{err_file}: 
" + message
                            )
            # log info
            for message in list(
                set(error_message_command_err + error_message_command_warning)
            ):
                log.info(f"   {message}")
            # debug info
            for message in list(set(error_message_command_all)):
                log.debug(f"   {message}")
            # failed: any [E::]-prefixed line from the tool aborts the annotation
            if len(error_message_command_err):
                log.error("Annotation failed: Error in commands")
                raise ValueError("Annotation failed: Error in commands")

            # Update variants
            log.info(f"Annotation - Updating...")
            self.update_from_vcf(tmp_annotate_vcf_name)

    def annotation_exomiser(self, threads: int = None) -> bool:
        """
        This function annotates variants with Exomiser.

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
            Default : None
        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"
        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None
        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
                "subject":
                    {
                        "id": "ISDBM322017",
                        "sex": "FEMALE"
                    }
            Default: None
        - "sample" (string):
            Sample name to construct "subject" section:
                "subject":
                    {
                        "id": "<sample>",
                        "sex": "UNKNOWN_SEX"
                    }
            Default: None
        - "phenotypicFeatures" (dict):
            Phenotypic features to construct "subject" section.
            Example:
                "phenotypicFeatures":
                    [
                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
                    ]
        - "hpo" (list):
            List of HPO ids as phenotypic features.
            Example:
                "hpo": ['0001156', '0001363', '0011304', '0010055']
            Default: []
        - "outputOptions" (dict):
            Output options (see Exomiser docs).
            Default:
                "output_options" =
                    {
                        "outputContributingVariantsOnly": False,
                        "numGenes": 0,
                        "outputFormats": ["TSV_VARIANT", "VCF"]
                    }
        - "transcript_source" (string):
            Transcript source (either "refseq", "ucsc", "ensembl")
            Default: "refseq"
        - "exomiser_to_info" (boolean):
            Add exomiser TSV file columns as INFO fields in VCF.
            Default: False
        - "release" (string):
            Exomiser database release.
            If it does not exist, the database release will be downloaded (takes a while).
            Default: None (provided by application.properties configuration file)
        - "exomiser_application_properties" (file):
            Exomiser configuration file (see Exomiser docs).
            Useful to automatically download databases (especially for specific genome databases).

        Notes:
        - If no sample in parameters, first sample in VCF will be chosen
        - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off

        :param threads: The number of threads to use
        :return: True on success, False when the VCF is empty or has no samples.
        """

        # DEBUG
        log.debug("Start annotation with Exomiser databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
        )
        databases_folders = full_path(databases_folders)
        # NOTE(review): a missing folder is only logged, not raised — the
        # databases_download_exomiser() call below may create it; confirm.
        if not os.path.exists(databases_folders):
            log.error(f"Databases annotations: {databases_folders} NOT found")
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - Exomiser
        exomiser_bin_command = get_bin_command(
            bin="exomiser-cli*.jar",
            tool="exomiser",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
        )
        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
        if not exomiser_bin_command:
            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - Exomiser
        param_exomiser = param.get("annotation", {}).get("exomiser", {})
        log.debug(f"Param Exomiser: {param_exomiser}")

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
        log.debug("Assembly: " + str(assembly))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return False

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Samples
        samples = self.get_header_sample_list()
        if not samples:
            log.error("No Samples in VCF")
            return False
        log.debug(f"Samples: {samples}")

        # Memory limit
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # Exomiser java options
        # NOTE(review): built but not referenced again in this method —
        # presumably get_bin_command embeds java options; confirm.
        exomiser_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {exomiser_java_options}")

        # Download Exomiser (if not exists)
        exomiser_release = param_exomiser.get("release", None)
        exomiser_application_properties = param_exomiser.get(
            "exomiser_application_properties", None
        )
        databases_download_exomiser(
            assemblies=[assembly],
            exomiser_folder=databases_folders,
            exomiser_release=exomiser_release,
            exomiser_phenotype_release=exomiser_release,
            exomiser_application_properties=exomiser_application_properties,
        )

        # Force annotation
        # NOTE(review): hard-coded True, so the condition below always runs.
        force_update_annotation = True

        if "Exomiser" not in self.get_header().infos or force_update_annotation:
            log.debug("Start annotation Exomiser")

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # tmp_dir = "/tmp/exomiser"

                ### ANALYSIS ###
                ################

                # Create analysis.json through analysis dict
                # either analysis in param or by default
                # depending on preset exome/genome)

                # Init analysis dict
                param_exomiser_analysis_dict = {}

                # analysis from param
                param_exomiser_analysis = param_exomiser.get("analysis", {})
                param_exomiser_analysis = full_path(param_exomiser_analysis)

                # If analysis in param -> load analysis json
                if param_exomiser_analysis:

                    # If param analysis is a file and exists
                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
                        param_exomiser_analysis
                    ):
                        # Load analysis file into analysis dict (either yaml or json)
                        with open(param_exomiser_analysis) as json_file:
                            param_exomiser_analysis_dict = yaml.safe_load(json_file)

                    # If param analysis is a dict
                    elif isinstance(param_exomiser_analysis, dict):
                        # Load analysis dict into analysis dict (either yaml or json)
                        param_exomiser_analysis_dict = param_exomiser_analysis

                    # Error analysis type
                    else:
                        log.error(f"Analysis type unknown. Check param file.")
                        raise ValueError(f"Analysis type unknown. Check param file.")

                # Case no input analysis config file/dict
                # Use preset (exome/genome) to open default config file
                if not param_exomiser_analysis_dict:

                    # default preset
                    default_preset = "exome"

                    # Get param preset or default preset
                    param_exomiser_preset = param_exomiser.get("preset", default_preset)

                    # Try to find if preset is a file
                    if os.path.exists(param_exomiser_preset):
                        # Preset file is provided in full path
                        param_exomiser_analysis_default_config_file = (
                            param_exomiser_preset
                        )
                    # elif os.path.exists(full_path(param_exomiser_preset)):
                    #     # Preset file is provided in full path
                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
                    elif os.path.exists(
                        os.path.join(folder_config, param_exomiser_preset)
                    ):
                        # Preset file is provided a basename in config folder (can be a path with subfolders)
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config, param_exomiser_preset
                        )
                    else:
                        # Construct preset file
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config,
                            f"preset-{param_exomiser_preset}-analysis.json",
                        )

                    # If preset file exists
                    param_exomiser_analysis_default_config_file = full_path(
                        param_exomiser_analysis_default_config_file
                    )
                    if os.path.exists(param_exomiser_analysis_default_config_file):
                        # Load preset file into analysis dict (either yaml or json)
                        with open(
                            param_exomiser_analysis_default_config_file
                        ) as json_file:
                            # param_exomiser_analysis_dict[""] = json.load(json_file)
                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
                                json_file
                            )

                    # Error preset file
                    else:
                        log.error(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )
                        raise ValueError(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )

                # If no analysis dict created
                if not param_exomiser_analysis_dict:
                    log.error(f"No analysis config")
                    raise ValueError(f"No analysis config")

                # Log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### PHENOPACKET ###
                ###################

                # If no PhenoPacket in analysis dict -> check in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # If PhenoPacket in param -> load analysis json
                    if param_exomiser.get("phenopacket", None):

                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
                        param_exomiser_phenopacket = full_path(
                            param_exomiser_phenopacket
                        )

                        # If param phenopacket is a file and exists
                        if isinstance(
                            param_exomiser_phenopacket, str
                        ) and os.path.exists(param_exomiser_phenopacket):
                            # Load phenopacket file into analysis dict (either yaml or json)
                            with open(param_exomiser_phenopacket) as json_file:
                                param_exomiser_analysis_dict["phenopacket"] = (
                                    yaml.safe_load(json_file)
                                )

                        # If param phenopacket is a dict
                        elif isinstance(param_exomiser_phenopacket, dict):
                            # Load phenopacket dict into analysis dict (either yaml or json)
                            param_exomiser_analysis_dict["phenopacket"] = (
                                param_exomiser_phenopacket
                            )

                        # Error phenopacket type
                        else:
                            log.error(f"Phenopacket type unknown. Check param file.")
                            raise ValueError(
                                f"Phenopacket type unknown. Check param file."
                            )

                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # Init PhenoPacket
                    param_exomiser_analysis_dict["phenopacket"] = {
                        "id": "analysis",
                        "proband": {},
                    }

                    ### Add subject ###

                    # If subject exists
                    param_exomiser_subject = param_exomiser.get("subject", {})

                    # If subject not exists -> found sample ID
                    if not param_exomiser_subject:

                        # Found sample ID in param
                        sample = param_exomiser.get("sample", None)

                        # Find sample ID (first sample)
                        if not sample:
                            sample_list = self.get_header_sample_list()
                            if len(sample_list) > 0:
                                sample = sample_list[0]
                            else:
                                log.error(f"No sample found")
                                raise ValueError(f"No sample found")

                        # Create subject
                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "subject"
                    ] = param_exomiser_subject

                    ### Add "phenotypicFeatures" ###

                    # If phenotypicFeatures exists
                    param_exomiser_phenotypicfeatures = param_exomiser.get(
                        "phenotypicFeatures", []
                    )

                    # If phenotypicFeatures not exists -> Try to infer from hpo list
                    if not param_exomiser_phenotypicfeatures:

                        # Found HPO in param
                        param_exomiser_hpo = param_exomiser.get("hpo", [])

                        # Split HPO if list in string format separated by comma
                        if isinstance(param_exomiser_hpo, str):
                            param_exomiser_hpo = param_exomiser_hpo.split(",")

                        # Create HPO list
                        # Each id is reduced to its digits, then rendered as "HP:<digits>"
                        for hpo in param_exomiser_hpo:
                            hpo_clean = re.sub("[^0-9]", "", hpo)
                            param_exomiser_phenotypicfeatures.append(
                                {
                                    "type": {
                                        "id": f"HP:{hpo_clean}",
                                        "label": f"HP:{hpo_clean}",
                                    }
                                }
                            )

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "phenotypicFeatures"
                    ] = param_exomiser_phenotypicfeatures

                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
                    # NOTE(review): removes from the same list being iterated —
                    # acceptable here only if at most one step matches; confirm.
                    if not param_exomiser_phenotypicfeatures:
                        for step in param_exomiser_analysis_dict.get(
                            "analysis", {}
                        ).get("steps", []):
                            if "hiPhivePrioritiser" in step:
                                param_exomiser_analysis_dict.get("analysis", {}).get(
                                    "steps", []
                                ).remove(step)

                ### Add Input File ###

                # Initial file name and htsFiles
                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
                    {
                        "uri": tmp_vcf_name,
                        "htsFormat": "VCF",
                        "genomeAssembly": assembly,
                    }
                ]

                ### Add metaData ###

                # If metaData not in analysis dict
                if "metaData" not in param_exomiser_analysis_dict:
                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
                        "createdBy": "howard",
                        "phenopacketSchemaVersion": 1,
                    }

                ### OutputOptions ###

                # Init output result folder
                output_results = os.path.join(tmp_dir, "results")

                # If no outputOptions in analysis dict
                if "outputOptions" not in param_exomiser_analysis_dict:

                    # default output formats
                    defaut_output_formats = ["TSV_VARIANT", "VCF"]

                    # Get outputOptions in param
                    output_options = param_exomiser.get("outputOptions", None)

                    # If no output_options in param -> check
                    if not output_options:
                        output_options = {
                            "outputContributingVariantsOnly": False,
                            "numGenes": 0,
                            "outputFormats": defaut_output_formats,
                        }

                    # Replace outputDirectory in output options
                    output_options["outputDirectory"] = output_results
                    output_options["outputFileName"] = "howard"

                    # Add outputOptions in analysis dict
                    param_exomiser_analysis_dict["outputOptions"] = output_options

                else:

                    # Replace output_results and output format (if exists in param)
                    # TSV_VARIANT and VCF are always forced so results can be parsed below
                    param_exomiser_analysis_dict["outputOptions"][
                        "outputDirectory"
                    ] = output_results
                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
                        list(
                            set(
                                param_exomiser_analysis_dict.get(
                                    "outputOptions", {}
                                ).get("outputFormats", [])
                                + ["TSV_VARIANT", "VCF"]
                            )
                        )
                    )

                # log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### ANALYSIS FILE ###
                #####################

                ### Full JSON analysis config file ###

                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
                with open(exomiser_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict, fp, indent=4)

                ### SPLIT analysis and sample config files

                # Splitted analysis dict
                # NOTE(review): dict.copy() is shallow — nested values are shared
                # with param_exomiser_analysis_dict; only the top-level "phenopacket"
                # key is popped below, so this is sufficient here.
                param_exomiser_analysis_dict_for_split = (
                    param_exomiser_analysis_dict.copy()
                )

                # Phenopacket JSON file
                exomiser_analysis_phenopacket = os.path.join(
                    tmp_dir, "analysis_phenopacket.json"
                )
                with open(exomiser_analysis_phenopacket, "w") as fp:
                    json.dump(
                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
                        fp,
                        indent=4,
                    )

                # Analysis JSON file without Phenopacket parameters
                param_exomiser_analysis_dict_for_split.pop("phenopacket")
                exomiser_analysis_analysis = os.path.join(
                    tmp_dir, "analysis_analysis.json"
                )
                with open(exomiser_analysis_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)

                ### INITAL VCF file ###
                #######################

                ### Create list of samples to use and include into initial VCF file ####

                # Subject (main sample)
                # Get sample ID in analysis dict
                sample_subject = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample_proband = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("proband", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample = []
                if sample_subject:
                    sample.append(sample_subject)
                if sample_proband:
                    sample.append(sample_proband)

                # Get sample ID within Pedigree
                pedigree_persons_list = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("pedigree", {})
                    .get("persons", {})
                )

                # Create list with all sample ID in pedigree (if exists)
                pedigree_persons = []
                for person in pedigree_persons_list:
                    pedigree_persons.append(person.get("individualId"))

                # Concat subject sample ID and samples ID in pedigree samples
                samples = list(set(sample + pedigree_persons))

                # Check if sample list is not empty
                if not samples:
                    log.error(f"No samples found")
                    raise ValueError(f"No samples found")

                # Create VCF with sample (either sample in param or first one by default)
                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=True,
                    list_samples=samples,
                    index=False,
                )

                ### Execute Exomiser ###
                ########################

                # Init command
                # NOTE(review): exomiser_command is never used afterwards — dead init.
                exomiser_command = ""

                # Command exomiser options
                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "

                # Release
                exomiser_release = param_exomiser.get("release", None)
                if exomiser_release:
                    # phenotype data version
                    exomiser_options += (
                        f" --exomiser.phenotype.data-version={exomiser_release} "
                    )
                    # data version
                    exomiser_options += (
                        f" --exomiser.{assembly}.data-version={exomiser_release} "
                    )
                    # variant white list
                    variant_white_list_file = (
                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
                    )
                    if os.path.exists(
                        os.path.join(
                            databases_folders, assembly, variant_white_list_file
                        )
                    ):
                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "

                # transcript_source
                transcript_source = param_exomiser.get(
                    "transcript_source", None
                )  # ucsc, refseq, ensembl
                if transcript_source:
                    exomiser_options += (
                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
                    )

                # If analysis contain proband param
                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
                    "proband", {}
                ):
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "

                # If no proband (usually uniq sample)
                else:
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"

                # Log
                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")

                # Run command
                # NOTE(review): naive .split() — breaks if any path contains spaces;
                # shlex.split would be safer. Non-zero exit code raises below.
                result = subprocess.call(
                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
                )
                if result:
                    log.error("Exomiser command failed")
                    raise ValueError("Exomiser command failed")

                ### RESULTS ###
                ###############

                ### Annotate with TSV fields ###

                # Init result tsv file
                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)

                # Init result tsv file
                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")

                # Parse TSV file and explode columns in INFO field
                if exomiser_to_info and os.path.exists(output_results_tsv):

                    # Log
                    log.debug("Exomiser columns to VCF INFO field")

                    # Retrieve columns and types
                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
                    output_results_tsv_df = self.get_query_to_df(query)
                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()

                    # Init concat fields for update
                    sql_query_update_concat_fields = []

                    # Fields to avoid
                    fields_to_avoid = [
                        "CONTIG",
                        "START",
                        "END",
                        "REF",
                        "ALT",
                        "QUAL",
                        "FILTER",
                        "GENOTYPE",
                    ]

                    # List all columns to add into header
                    for header_column in output_results_tsv_columns:

                        # If header column is enable
                        if header_column not in fields_to_avoid:

                            # Header info type
                            # object dtype: "Float" if fully numeric else "String";
                            # NOTE(review): non-object (numeric) dtypes are all
                            # declared "Integer", even float columns — confirm.
                            header_info_type = "String"
                            header_column_df = output_results_tsv_df[header_column]
                            header_column_df_dtype = header_column_df.dtype
                            if header_column_df_dtype == object:
                                if (
                                    pd.to_numeric(header_column_df, errors="coerce")
                                    .notnull()
                                    .all()
                                ):
                                    header_info_type = "Float"
                            else:
                                header_info_type = "Integer"

                            # Header info
                            # '-' is not valid in a VCF INFO key: replace with '_';
                            # '#' (e.g. '#CHROM'-like columns) is stripped
                            characters_to_validate = ["-"]
                            pattern = "[" + "".join(characters_to_validate) + "]"
                            header_info_name = re.sub(
                                pattern,
                                "_",
                                f"Exomiser_{header_column}".replace("#", ""),
                            )
                            header_info_number = "."
5132 header_info_description = ( 5133 f"Exomiser {header_column} annotation" 5134 ) 5135 header_info_source = "Exomiser" 5136 header_info_version = "unknown" 5137 header_info_code = CODE_TYPE_MAP[header_info_type] 5138 vcf_reader.infos[header_info_name] = vcf.parser._Info( 5139 header_info_name, 5140 header_info_number, 5141 header_info_type, 5142 header_info_description, 5143 header_info_source, 5144 header_info_version, 5145 header_info_code, 5146 ) 5147 5148 # Add field to add for update to concat fields 5149 sql_query_update_concat_fields.append( 5150 f""" 5151 CASE 5152 WHEN table_parquet."{header_column}" NOT IN ('','.') 5153 THEN concat( 5154 '{header_info_name}=', 5155 table_parquet."{header_column}", 5156 ';' 5157 ) 5158 5159 ELSE '' 5160 END 5161 """ 5162 ) 5163 5164 # Update query 5165 sql_query_update = f""" 5166 UPDATE {table_variants} as table_variants 5167 SET INFO = concat( 5168 CASE 5169 WHEN INFO NOT IN ('', '.') 5170 THEN INFO 5171 ELSE '' 5172 END, 5173 CASE 5174 WHEN table_variants.INFO NOT IN ('','.') 5175 THEN ';' 5176 ELSE '' 5177 END, 5178 ( 5179 SELECT 5180 concat( 5181 {",".join(sql_query_update_concat_fields)} 5182 ) 5183 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 5184 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 5185 AND table_parquet.\"START\" = table_variants.\"POS\" 5186 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5187 AND table_parquet.\"REF\" = table_variants.\"REF\" 5188 ) 5189 ) 5190 ; 5191 """ 5192 5193 # Update 5194 self.conn.execute(sql_query_update) 5195 5196 ### Annotate with VCF INFO field ### 5197 5198 # Init result VCF file 5199 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 5200 5201 # If VCF exists 5202 if os.path.exists(output_results_vcf): 5203 5204 # Log 5205 log.debug("Exomiser result VCF update variants") 5206 5207 # Find Exomiser INFO field annotation in header 5208 with 
gzip.open(output_results_vcf, "rt") as f: 5209 header_list = self.read_vcf_header(f) 5210 exomiser_vcf_header = vcf.Reader( 5211 io.StringIO("\n".join(header_list)) 5212 ) 5213 5214 # Add annotation INFO field to header 5215 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 5216 5217 # Update variants with VCF 5218 self.update_from_vcf(output_results_vcf) 5219 5220 return True 5221 5222 def annotation_snpeff(self, threads: int = None) -> None: 5223 """ 5224 This function annotate with snpEff 5225 5226 :param threads: The number of threads to use 5227 :return: the value of the variable "return_value". 5228 """ 5229 5230 # DEBUG 5231 log.debug("Start annotation with snpeff databases") 5232 5233 # Threads 5234 if not threads: 5235 threads = self.get_threads() 5236 log.debug("Threads: " + str(threads)) 5237 5238 # DEBUG 5239 delete_tmp = True 5240 if self.get_config().get("verbosity", "warning") in ["debug"]: 5241 delete_tmp = False 5242 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5243 5244 # Config 5245 config = self.get_config() 5246 log.debug("Config: " + str(config)) 5247 5248 # Config - Folders - Databases 5249 databases_folders = ( 5250 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 5251 ) 5252 log.debug("Databases annotations: " + str(databases_folders)) 5253 5254 # Config - snpEff bin command 5255 snpeff_bin_command = get_bin_command( 5256 bin="snpEff.jar", 5257 tool="snpeff", 5258 bin_type="jar", 5259 config=config, 5260 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 5261 ) 5262 if not snpeff_bin_command: 5263 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 5264 log.error(msg_err) 5265 raise ValueError(msg_err) 5266 5267 # Config - snpEff databases 5268 snpeff_databases = ( 5269 config.get("folders", {}) 5270 .get("databases", {}) 5271 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 5272 ) 5273 snpeff_databases = full_path(snpeff_databases) 5274 if snpeff_databases is not None and 
snpeff_databases != "": 5275 log.debug(f"Create snpEff databases folder") 5276 if not os.path.exists(snpeff_databases): 5277 os.makedirs(snpeff_databases) 5278 5279 # Param 5280 param = self.get_param() 5281 log.debug("Param: " + str(param)) 5282 5283 # Param 5284 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 5285 log.debug("Options: " + str(options)) 5286 5287 # Param - Assembly 5288 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5289 5290 # Param - Options 5291 snpeff_options = ( 5292 param.get("annotation", {}).get("snpeff", {}).get("options", "") 5293 ) 5294 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 5295 snpeff_csvstats = ( 5296 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 5297 ) 5298 if snpeff_stats: 5299 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 5300 snpeff_stats = full_path(snpeff_stats) 5301 snpeff_options += f" -stats {snpeff_stats}" 5302 if snpeff_csvstats: 5303 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 5304 snpeff_csvstats = full_path(snpeff_csvstats) 5305 snpeff_options += f" -csvStats {snpeff_csvstats}" 5306 5307 # Data 5308 table_variants = self.get_table_variants() 5309 5310 # Check if not empty 5311 log.debug("Check if not empty") 5312 sql_query_chromosomes = ( 5313 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5314 ) 5315 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 5316 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5317 log.info(f"VCF empty") 5318 return 5319 5320 # Export in VCF 5321 log.debug("Create initial file to annotate") 5322 tmp_vcf = NamedTemporaryFile( 5323 prefix=self.get_prefix(), 5324 dir=self.get_tmp_dir(), 5325 suffix=".vcf.gz", 5326 delete=True, 5327 ) 5328 tmp_vcf_name = tmp_vcf.name 5329 5330 # VCF header 5331 vcf_reader = self.get_header() 5332 log.debug("Initial header: " + 
str(vcf_reader.infos)) 5333 5334 # Existing annotations 5335 for vcf_annotation in self.get_header().infos: 5336 5337 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5338 log.debug( 5339 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5340 ) 5341 5342 # Memory limit 5343 # if config.get("memory", None): 5344 # memory_limit = config.get("memory", "8G") 5345 # else: 5346 # memory_limit = "8G" 5347 memory_limit = self.get_memory("8G") 5348 log.debug(f"memory_limit: {memory_limit}") 5349 5350 # snpEff java options 5351 snpeff_java_options = ( 5352 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 5353 ) 5354 log.debug(f"Exomiser java options: {snpeff_java_options}") 5355 5356 force_update_annotation = True 5357 5358 if "ANN" not in self.get_header().infos or force_update_annotation: 5359 5360 # Check snpEff database 5361 log.debug(f"Check snpEff databases {[assembly]}") 5362 databases_download_snpeff( 5363 folder=snpeff_databases, assemblies=[assembly], config=config 5364 ) 5365 5366 # Export VCF file 5367 self.export_variant_vcf( 5368 vcf_file=tmp_vcf_name, 5369 remove_info=True, 5370 add_samples=False, 5371 index=True, 5372 ) 5373 5374 # Tmp file 5375 err_files = [] 5376 tmp_annotate_vcf = NamedTemporaryFile( 5377 prefix=self.get_prefix(), 5378 dir=self.get_tmp_dir(), 5379 suffix=".vcf", 5380 delete=False, 5381 ) 5382 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5383 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5384 err_files.append(tmp_annotate_vcf_name_err) 5385 5386 # Command 5387 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 5388 log.debug(f"Annotation - snpEff command: {snpeff_command}") 5389 run_parallel_commands([snpeff_command], 1) 5390 5391 # Error messages 5392 log.info(f"Error/Warning messages:") 5393 error_message_command_all = [] 5394 
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the loaded variants with Annovar databases.

        Exports the variants table to a temporary VCF, runs `table_annovar.pl` once per
        configured annotation database (piping the result through sed/awk/bcftools to
        clean Annovar artifacts and rename INFO fields), merges all annotated VCFs with
        `bcftools merge`, registers the new INFO fields in the in-memory header, and
        updates the variants table from the merged VCF. Temporary files are removed at
        the end.

        :param threads: number of threads to use; defaults to `self.get_threads()`
        :return: None (returns early if the variants table is empty)
        :raises ValueError: if the annovar/bcftools binaries or the databases folder
            cannot be resolved, or if any underlying command wrote an error line to
            its stderr capture file
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp en Err files
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is computed and logged but never consulted below —
        # the final cleanup runs under `if True:` — confirm whether cleanup should be
        # gated on delete_tmp (i.e. keep tmp files in debug mode).
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases
        # A list value is collapsed to its first entry; the folder is created if missing.
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = f"Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # NOTE(review): hard-coded to True, so existing annotations are always re-annotated.
        force_update_annotation = True

        if annotations:

            # NOTE(review): `commands` is never used below — leftover scaffolding.
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename
            # One "INFO/<old> <new>" line per field is appended below; consumed by
            # `bcftools annotate --rename-annots`.
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is re-initialized for each database, so only
                # the current database's stderr file is scanned below — previous entries
                # were already checked in the previous iteration.
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                # NOTE(review): annotation_renamed_list is populated but never consumed
                # (the keep-list loop below iterates annotation_list instead).
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation
                # Annovar operation code: "g" gene-based, "r" region-based, "f" filter-based.
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        # "^INFO/x" in bcftools -x means: keep x, remove the others.
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages
                # Scan the stderr capture files: "[W::"/"WARNING" lines are warnings,
                # "[E::"/"ERROR" lines are fatal and raise below.
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge
                # The original (unannotated) VCF is merged first so positions missing
                # from some databases are preserved.
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                # Register INFO fields added by Annovar into the in-memory header.
                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        # NOTE(review): literal True — see delete_tmp note above; tmp files are always removed.
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)
Updating...") 5814 self.update_from_vcf(tmp_annotate_vcf_name) 5815 5816 # Clean files 5817 # Tmp file remove command 5818 if True: 5819 tmp_files_remove_command = "" 5820 if tmp_files: 5821 tmp_files_remove_command = " ".join(tmp_files) 5822 clean_command = f" rm -f {tmp_files_remove_command} " 5823 log.debug(f"Annotation Annovar - Annotation cleaning ") 5824 log.debug(f"Annotation - cleaning command: {clean_command}") 5825 run_parallel_commands([clean_command], 1) 5826 5827 # Parquet 5828 def annotation_parquet(self, threads: int = None) -> None: 5829 """ 5830 It takes a VCF file, and annotates it with a parquet file 5831 5832 :param threads: number of threads to use for the annotation 5833 :return: the value of the variable "result". 5834 """ 5835 5836 # DEBUG 5837 log.debug("Start annotation with parquet databases") 5838 5839 # Threads 5840 if not threads: 5841 threads = self.get_threads() 5842 log.debug("Threads: " + str(threads)) 5843 5844 # DEBUG 5845 delete_tmp = True 5846 if self.get_config().get("verbosity", "warning") in ["debug"]: 5847 delete_tmp = False 5848 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5849 5850 # Config 5851 databases_folders = set( 5852 self.get_config() 5853 .get("folders", {}) 5854 .get("databases", {}) 5855 .get("annotations", ["."]) 5856 + self.get_config() 5857 .get("folders", {}) 5858 .get("databases", {}) 5859 .get("parquet", ["."]) 5860 ) 5861 log.debug("Databases annotations: " + str(databases_folders)) 5862 5863 # Param 5864 annotations = ( 5865 self.get_param() 5866 .get("annotation", {}) 5867 .get("parquet", {}) 5868 .get("annotations", None) 5869 ) 5870 log.debug("Annotations: " + str(annotations)) 5871 5872 # Assembly 5873 assembly = self.get_param().get( 5874 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 5875 ) 5876 5877 # Force Update Annotation 5878 force_update_annotation = ( 5879 self.get_param() 5880 .get("annotation", {}) 5881 .get("options", {}) 5882 .get("annotations_update", 
False) 5883 ) 5884 log.debug(f"force_update_annotation={force_update_annotation}") 5885 force_append_annotation = ( 5886 self.get_param() 5887 .get("annotation", {}) 5888 .get("options", {}) 5889 .get("annotations_append", False) 5890 ) 5891 log.debug(f"force_append_annotation={force_append_annotation}") 5892 5893 # Data 5894 table_variants = self.get_table_variants() 5895 5896 # Check if not empty 5897 log.debug("Check if not empty") 5898 sql_query_chromosomes_df = self.get_query_to_df( 5899 f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1""" 5900 ) 5901 if not sql_query_chromosomes_df["count"][0]: 5902 log.info(f"VCF empty") 5903 return 5904 5905 # VCF header 5906 vcf_reader = self.get_header() 5907 log.debug("Initial header: " + str(vcf_reader.infos)) 5908 5909 # Nb Variants POS 5910 log.debug("NB Variants Start") 5911 nb_variants = self.conn.execute( 5912 f"SELECT count(*) AS count FROM variants" 5913 ).fetchdf()["count"][0] 5914 log.debug("NB Variants Stop") 5915 5916 # Existing annotations 5917 for vcf_annotation in self.get_header().infos: 5918 5919 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5920 log.debug( 5921 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5922 ) 5923 5924 # Added columns 5925 added_columns = [] 5926 5927 # drop indexes 5928 log.debug(f"Drop indexes...") 5929 self.drop_indexes() 5930 5931 if annotations: 5932 5933 if "ALL" in annotations: 5934 5935 all_param = annotations.get("ALL", {}) 5936 all_param_formats = all_param.get("formats", None) 5937 all_param_releases = all_param.get("releases", None) 5938 5939 databases_infos_dict = self.scan_databases( 5940 database_formats=all_param_formats, 5941 database_releases=all_param_releases, 5942 ) 5943 for database_infos in databases_infos_dict.keys(): 5944 if database_infos not in annotations: 5945 annotations[database_infos] = {"INFO": None} 5946 5947 for annotation in annotations: 5948 5949 if annotation in ["ALL"]: 
5950 continue 5951 5952 # Annotation Name 5953 annotation_name = os.path.basename(annotation) 5954 5955 # Annotation fields 5956 annotation_fields = annotations[annotation] 5957 if not annotation_fields: 5958 annotation_fields = {"INFO": None} 5959 5960 log.debug(f"Annotation '{annotation_name}'") 5961 log.debug( 5962 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 5963 ) 5964 5965 # Create Database 5966 database = Database( 5967 database=annotation, 5968 databases_folders=databases_folders, 5969 assembly=assembly, 5970 ) 5971 5972 # Find files 5973 parquet_file = database.get_database() 5974 parquet_hdr_file = database.get_header_file() 5975 parquet_type = database.get_type() 5976 5977 # Check if files exists 5978 if not parquet_file or not parquet_hdr_file: 5979 msg_err_list = [] 5980 if not parquet_file: 5981 msg_err_list.append( 5982 f"Annotation failed: Annotation file not found" 5983 ) 5984 if parquet_file and not parquet_hdr_file: 5985 msg_err_list.append( 5986 f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'" 5987 ) 5988 5989 log.error(". ".join(msg_err_list)) 5990 raise ValueError(". 
".join(msg_err_list)) 5991 else: 5992 # Get parquet connexion 5993 parquet_sql_attach = database.get_sql_database_attach( 5994 output="query" 5995 ) 5996 if parquet_sql_attach: 5997 self.conn.execute(parquet_sql_attach) 5998 parquet_file_link = database.get_sql_database_link() 5999 # Log 6000 log.debug( 6001 f"Annotation '{annotation_name}' - file: " 6002 + str(parquet_file) 6003 + " and " 6004 + str(parquet_hdr_file) 6005 ) 6006 6007 # Database full header columns 6008 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 6009 parquet_hdr_file 6010 ) 6011 # Log 6012 log.debug( 6013 "Annotation database header columns : " 6014 + str(parquet_hdr_vcf_header_columns) 6015 ) 6016 6017 # Load header as VCF object 6018 parquet_hdr_vcf_header_infos = database.get_header().infos 6019 # Log 6020 log.debug( 6021 "Annotation database header: " 6022 + str(parquet_hdr_vcf_header_infos) 6023 ) 6024 6025 # Get extra infos 6026 parquet_columns = database.get_extra_columns() 6027 # Log 6028 log.debug("Annotation database Columns: " + str(parquet_columns)) 6029 6030 # Add extra columns if "ALL" in annotation_fields 6031 # if "ALL" in annotation_fields: 6032 # allow_add_extra_column = True 6033 if "ALL" in annotation_fields and database.get_extra_columns(): 6034 for extra_column in database.get_extra_columns(): 6035 if ( 6036 extra_column not in annotation_fields 6037 and extra_column.replace("INFO/", "") 6038 not in parquet_hdr_vcf_header_infos 6039 ): 6040 parquet_hdr_vcf_header_infos[extra_column] = ( 6041 vcf.parser._Info( 6042 extra_column, 6043 ".", 6044 "String", 6045 f"{extra_column} description", 6046 "unknown", 6047 "unknown", 6048 self.code_type_map["String"], 6049 ) 6050 ) 6051 6052 # For all fields in database 6053 annotation_fields_all = False 6054 if "ALL" in annotation_fields or "INFO" in annotation_fields: 6055 annotation_fields_all = True 6056 annotation_fields = { 6057 key: key for key in parquet_hdr_vcf_header_infos 6058 } 6059 6060 log.debug( 6061 
"Annotation database header - All annotations added: " 6062 + str(annotation_fields) 6063 ) 6064 6065 # Init 6066 6067 # List of annotation fields to use 6068 sql_query_annotation_update_info_sets = [] 6069 6070 # List of annotation to agregate 6071 sql_query_annotation_to_agregate = [] 6072 6073 # Number of fields 6074 nb_annotation_field = 0 6075 6076 # Annotation fields processed 6077 annotation_fields_processed = [] 6078 6079 # Columns mapping 6080 map_columns = database.map_columns( 6081 columns=annotation_fields, prefixes=["INFO/"] 6082 ) 6083 6084 # Query dict for fields to remove (update option) 6085 query_dict_remove = {} 6086 6087 # Fetch Anotation fields 6088 for annotation_field in annotation_fields: 6089 6090 # annotation_field_column 6091 annotation_field_column = map_columns.get( 6092 annotation_field, "INFO" 6093 ) 6094 6095 # field new name, if parametered 6096 annotation_fields_new_name = annotation_fields.get( 6097 annotation_field, annotation_field 6098 ) 6099 if not annotation_fields_new_name: 6100 annotation_fields_new_name = annotation_field 6101 6102 # To annotate 6103 # force_update_annotation = True 6104 # force_append_annotation = True 6105 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): 6106 if annotation_field in parquet_hdr_vcf_header_infos and ( 6107 force_update_annotation 6108 or force_append_annotation 6109 or ( 6110 annotation_fields_new_name 6111 not in self.get_header().infos 6112 ) 6113 ): 6114 6115 # Add field to annotation to process list 6116 annotation_fields_processed.append( 6117 annotation_fields_new_name 6118 ) 6119 6120 # explode infos for the field 6121 annotation_fields_new_name_info_msg = "" 6122 if ( 6123 force_update_annotation 6124 and annotation_fields_new_name 6125 in self.get_header().infos 6126 ): 6127 # Remove field from INFO 6128 query = f""" 6129 UPDATE {table_variants} as table_variants 6130 SET INFO = 
REGEXP_REPLACE( 6131 concat(table_variants.INFO,''), 6132 ';*{annotation_fields_new_name}=[^;]*', 6133 '' 6134 ) 6135 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 6136 """ 6137 annotation_fields_new_name_info_msg = " [update]" 6138 query_dict_remove[ 6139 f"remove 'INFO/{annotation_fields_new_name}'" 6140 ] = query 6141 6142 # Sep between fields in INFO 6143 nb_annotation_field += 1 6144 if nb_annotation_field > 1: 6145 annotation_field_sep = ";" 6146 else: 6147 annotation_field_sep = "" 6148 6149 log.info( 6150 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 6151 ) 6152 6153 # Add INFO field to header 6154 parquet_hdr_vcf_header_infos_number = ( 6155 parquet_hdr_vcf_header_infos[annotation_field].num 6156 or "." 6157 ) 6158 parquet_hdr_vcf_header_infos_type = ( 6159 parquet_hdr_vcf_header_infos[annotation_field].type 6160 or "String" 6161 ) 6162 parquet_hdr_vcf_header_infos_description = ( 6163 parquet_hdr_vcf_header_infos[annotation_field].desc 6164 or f"{annotation_field} description" 6165 ) 6166 parquet_hdr_vcf_header_infos_source = ( 6167 parquet_hdr_vcf_header_infos[annotation_field].source 6168 or "unknown" 6169 ) 6170 parquet_hdr_vcf_header_infos_version = ( 6171 parquet_hdr_vcf_header_infos[annotation_field].version 6172 or "unknown" 6173 ) 6174 6175 vcf_reader.infos[annotation_fields_new_name] = ( 6176 vcf.parser._Info( 6177 annotation_fields_new_name, 6178 parquet_hdr_vcf_header_infos_number, 6179 parquet_hdr_vcf_header_infos_type, 6180 parquet_hdr_vcf_header_infos_description, 6181 parquet_hdr_vcf_header_infos_source, 6182 parquet_hdr_vcf_header_infos_version, 6183 self.code_type_map[ 6184 parquet_hdr_vcf_header_infos_type 6185 ], 6186 ) 6187 ) 6188 6189 # Append 6190 if force_append_annotation: 6191 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 
6192 else: 6193 query_case_when_append = "" 6194 6195 # Annotation/Update query fields 6196 # Found in INFO column 6197 if ( 6198 annotation_field_column == "INFO" 6199 and "INFO" in parquet_hdr_vcf_header_columns 6200 ): 6201 sql_query_annotation_update_info_sets.append( 6202 f""" 6203 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 6204 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 6205 ELSE '' 6206 END 6207 """ 6208 ) 6209 # Found in a specific column 6210 else: 6211 sql_query_annotation_update_info_sets.append( 6212 f""" 6213 CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append} 6214 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ',')) 6215 ELSE '' 6216 END 6217 """ 6218 ) 6219 sql_query_annotation_to_agregate.append( 6220 f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 6221 ) 6222 6223 # Not to annotate 6224 else: 6225 6226 if force_update_annotation: 6227 annotation_message = "forced" 6228 else: 6229 annotation_message = "skipped" 6230 6231 if annotation_field not in parquet_hdr_vcf_header_infos: 6232 log.warning( 6233 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 6234 ) 6235 if annotation_fields_new_name in self.get_header().infos: 6236 log.warning( 6237 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 6238 ) 6239 6240 # Check if ALL fields have to be annotated. 
Thus concat all INFO field 6241 # allow_annotation_full_info = True 6242 allow_annotation_full_info = not force_append_annotation 6243 6244 if parquet_type in ["regions"]: 6245 allow_annotation_full_info = False 6246 6247 if ( 6248 allow_annotation_full_info 6249 and nb_annotation_field == len(annotation_fields) 6250 and annotation_fields_all 6251 and ( 6252 "INFO" in parquet_hdr_vcf_header_columns 6253 and "INFO" in database.get_extra_columns() 6254 ) 6255 ): 6256 log.debug("Column INFO annotation enabled") 6257 sql_query_annotation_update_info_sets = [] 6258 sql_query_annotation_update_info_sets.append( 6259 f" table_parquet.INFO " 6260 ) 6261 6262 if sql_query_annotation_update_info_sets: 6263 6264 # Annotate 6265 log.info(f"Annotation '{annotation_name}' - Annotation...") 6266 6267 # Join query annotation update info sets for SQL 6268 sql_query_annotation_update_info_sets_sql = ",".join( 6269 sql_query_annotation_update_info_sets 6270 ) 6271 6272 # Check chromosomes list (and variants infos) 6273 sql_query_chromosomes = f""" 6274 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 6275 FROM {table_variants} as table_variants 6276 GROUP BY table_variants."#CHROM" 6277 ORDER BY table_variants."#CHROM" 6278 """ 6279 sql_query_chromosomes_df = self.conn.execute( 6280 sql_query_chromosomes 6281 ).df() 6282 sql_query_chromosomes_dict = { 6283 entry["CHROM"]: { 6284 "count": entry["count_variants"], 6285 "min": entry["min_variants"], 6286 "max": entry["max_variants"], 6287 } 6288 for index, entry in sql_query_chromosomes_df.iterrows() 6289 } 6290 6291 # Init 6292 nb_of_query = 0 6293 nb_of_variant_annotated = 0 6294 query_dict = query_dict_remove 6295 6296 # for chrom in sql_query_chromosomes_df["CHROM"]: 6297 for chrom in sql_query_chromosomes_dict: 6298 6299 # Number of variant by chromosome 6300 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 6301 chrom, {} 6302 ).get("count", 0) 6303 6304 
log.debug( 6305 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 6306 ) 6307 6308 # Annotation with regions database 6309 if parquet_type in ["regions"]: 6310 sql_query_annotation_from_clause = f""" 6311 FROM ( 6312 SELECT 6313 '{chrom}' AS \"#CHROM\", 6314 table_variants_from.\"POS\" AS \"POS\", 6315 {",".join(sql_query_annotation_to_agregate)} 6316 FROM {table_variants} as table_variants_from 6317 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 6318 table_parquet_from."#CHROM" = '{chrom}' 6319 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 6320 AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 6321 ) 6322 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 6323 GROUP BY table_variants_from.\"POS\" 6324 ) 6325 as table_parquet 6326 """ 6327 6328 sql_query_annotation_where_clause = """ 6329 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6330 AND table_parquet.\"POS\" = table_variants.\"POS\" 6331 """ 6332 6333 # Annotation with variants database 6334 else: 6335 sql_query_annotation_from_clause = f""" 6336 FROM {parquet_file_link} as table_parquet 6337 """ 6338 sql_query_annotation_where_clause = f""" 6339 table_variants."#CHROM" = '{chrom}' 6340 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6341 AND table_parquet.\"POS\" = table_variants.\"POS\" 6342 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 6343 AND table_parquet.\"REF\" = table_variants.\"REF\" 6344 """ 6345 6346 # Create update query 6347 sql_query_annotation_chrom_interval_pos = f""" 6348 UPDATE {table_variants} as table_variants 6349 SET INFO = 6350 concat( 6351 CASE WHEN table_variants.INFO NOT IN ('','.') 6352 THEN table_variants.INFO 6353 ELSE '' 6354 END 6355 , 6356 CASE WHEN table_variants.INFO NOT IN ('','.') 6357 AND ( 6358 concat({sql_query_annotation_update_info_sets_sql}) 6359 ) 6360 NOT IN ('','.') 6361 THEN ';' 6362 ELSE '' 6363 END 6364 , 6365 
{sql_query_annotation_update_info_sets_sql} 6366 ) 6367 {sql_query_annotation_from_clause} 6368 WHERE {sql_query_annotation_where_clause} 6369 ; 6370 """ 6371 6372 # Add update query to dict 6373 query_dict[ 6374 f"{chrom} [{nb_of_variant_by_chrom} variants]" 6375 ] = sql_query_annotation_chrom_interval_pos 6376 6377 nb_of_query = len(query_dict) 6378 num_query = 0 6379 6380 # SET max_expression_depth TO x 6381 self.conn.execute("SET max_expression_depth TO 10000") 6382 6383 for query_name in query_dict: 6384 query = query_dict[query_name] 6385 num_query += 1 6386 log.info( 6387 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 6388 ) 6389 result = self.conn.execute(query) 6390 nb_of_variant_annotated_by_query = result.df()["Count"][0] 6391 nb_of_variant_annotated += nb_of_variant_annotated_by_query 6392 log.info( 6393 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated" 6394 ) 6395 6396 log.info( 6397 f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)" 6398 ) 6399 6400 else: 6401 6402 log.info( 6403 f"Annotation '{annotation_name}' - No Annotations available" 6404 ) 6405 6406 log.debug("Final header: " + str(vcf_reader.infos)) 6407 6408 # Remove added columns 6409 for added_column in added_columns: 6410 self.drop_column(column=added_column) 6411 6412 def annotation_splice(self, threads: int = None) -> None: 6413 """ 6414 This function annotate with snpEff 6415 6416 :param threads: The number of threads to use 6417 :return: the value of the variable "return_value". 
6418 """ 6419 6420 # DEBUG 6421 log.debug("Start annotation with splice tools") 6422 6423 # Threads 6424 if not threads: 6425 threads = self.get_threads() 6426 log.debug("Threads: " + str(threads)) 6427 6428 # DEBUG 6429 delete_tmp = True 6430 if self.get_config().get("verbosity", "warning") in ["debug"]: 6431 delete_tmp = False 6432 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6433 6434 # Config 6435 config = self.get_config() 6436 log.debug("Config: " + str(config)) 6437 splice_config = config.get("tools", {}).get("splice", {}) 6438 if not splice_config: 6439 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6440 msg_err = "No Splice tool config" 6441 raise ValueError(msg_err) 6442 log.debug(f"splice_config: {splice_config}") 6443 6444 # Config - Folders - Databases 6445 databases_folders = ( 6446 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6447 ) 6448 log.debug("Databases annotations: " + str(databases_folders)) 6449 6450 # Splice docker image 6451 splice_docker_image = splice_config.get("docker").get("image") 6452 6453 # Pull splice image if it's not already there 6454 if not check_docker_image_exists(splice_docker_image): 6455 log.warning( 6456 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6457 ) 6458 try: 6459 command(f"docker pull {splice_config.get('docker').get('image')}") 6460 except subprocess.CalledProcessError: 6461 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6462 log.error(msg_err) 6463 raise ValueError(msg_err) 6464 6465 # Config - splice databases 6466 splice_databases = ( 6467 config.get("folders", {}) 6468 .get("databases", {}) 6469 .get("splice", DEFAULT_SPLICE_FOLDER) 6470 ) 6471 splice_databases = full_path(splice_databases) 6472 6473 # Param 6474 param = self.get_param() 6475 log.debug("Param: " + str(param)) 6476 6477 # Param 6478 options = param.get("annotation", {}).get("splice", {}).get("options", {}) 6479 
log.debug("Options: " + str(options)) 6480 6481 # Data 6482 table_variants = self.get_table_variants() 6483 6484 # Check if not empty 6485 log.debug("Check if not empty") 6486 sql_query_chromosomes = ( 6487 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6488 ) 6489 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6490 log.info("VCF empty") 6491 return None 6492 6493 # Export in VCF 6494 log.debug("Create initial file to annotate") 6495 6496 # Create output folder / work folder 6497 if options.get("output_folder", ""): 6498 output_folder = options.get("output_folder", "") 6499 if not os.path.exists(output_folder): 6500 Path(output_folder).mkdir(parents=True, exist_ok=True) 6501 else: 6502 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6503 if not os.path.exists(output_folder): 6504 Path(output_folder).mkdir(parents=True, exist_ok=True) 6505 6506 if options.get("workdir", ""): 6507 workdir = options.get("workdir", "") 6508 else: 6509 workdir = "/work" 6510 6511 # Create tmp VCF file 6512 tmp_vcf = NamedTemporaryFile( 6513 prefix=self.get_prefix(), 6514 dir=output_folder, 6515 suffix=".vcf", 6516 delete=False, 6517 ) 6518 tmp_vcf_name = tmp_vcf.name 6519 6520 # VCF header 6521 header = self.get_header() 6522 6523 # Existing annotations 6524 for vcf_annotation in self.get_header().infos: 6525 6526 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6527 log.debug( 6528 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6529 ) 6530 6531 # Memory limit 6532 if config.get("memory", None): 6533 memory_limit = config.get("memory", "8G").upper() 6534 # upper() 6535 else: 6536 memory_limit = "8G" 6537 log.debug(f"memory_limit: {memory_limit}") 6538 6539 # Check number of variants to annotate 6540 where_clause_regex_spliceai = r"SpliceAI_\w+" 6541 where_clause_regex_spip = r"SPiP_\w+" 6542 where_clause = f""" WHERE NOT regexp_matches("INFO", 
'{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6543 df_list_of_variants_to_annotate = self.get_query_to_df( 6544 query=f""" SELECT * FROM variants {where_clause} """ 6545 ) 6546 if len(df_list_of_variants_to_annotate) == 0: 6547 log.warning( 6548 f"No variants to annotate with splice. Variants probably already annotated with splice" 6549 ) 6550 return None 6551 else: 6552 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6553 6554 # Export VCF file 6555 self.export_variant_vcf( 6556 vcf_file=tmp_vcf_name, 6557 remove_info=True, 6558 add_samples=True, 6559 index=False, 6560 where_clause=where_clause, 6561 ) 6562 mount = [f" -v {path}:{path}:rw" for path in [output_folder]] 6563 if any(value for value in splice_config.values() if value is None): 6564 log.warning("At least one splice config parameter is empty") 6565 # exit annotation_splice 6566 return None 6567 6568 # Params in splice nf 6569 def check_values(dico: dict): 6570 """ 6571 Ensure parameters for NF splice pipeline 6572 """ 6573 for key, val in dico.items(): 6574 if key == "genome": 6575 if any( 6576 assemb in options.get("genome", {}) 6577 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6578 ): 6579 yield f"--{key} hg19" 6580 elif any( 6581 assemb in options.get("genome", {}) 6582 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6583 ): 6584 yield f"--{key} hg38" 6585 elif ( 6586 (isinstance(val, str) and val) 6587 or isinstance(val, int) 6588 or isinstance(val, bool) 6589 ): 6590 yield f"--{key} {val}" 6591 6592 # Genome 6593 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6594 options["genome"] = genome 6595 # NF params 6596 nf_params = [] 6597 # Add options 6598 if options: 6599 log.debug(options) 6600 nf_params = list(check_values(options)) 6601 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6602 else: 6603 log.debug("No NF params provided") 6604 # Add threads 6605 if "threads" not in 
options.keys(): 6606 nf_params.append(f"--threads {threads}") 6607 # Genome path 6608 genome_path = find_genome( 6609 config.get("folders", {}) 6610 .get("databases", {}) 6611 .get("genomes", DEFAULT_GENOME_FOLDER), 6612 file=f"{genome}.fa", 6613 ) 6614 # Add genome path 6615 if not genome_path: 6616 raise ValueError( 6617 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6618 ) 6619 else: 6620 log.debug(f"Genome: {genome_path}") 6621 nf_params.append(f"--genome_path {genome_path}") 6622 6623 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6624 """ 6625 Setting up updated databases for SPiP and SpliceAI 6626 """ 6627 6628 try: 6629 6630 # SpliceAI assembly transcriptome 6631 spliceai_assembly = os.path.join( 6632 config.get("folders", {}).get("databases", {}).get("spliceai", {}), 6633 options.get("genome"), 6634 "transcriptome", 6635 ) 6636 spip_assembly = options.get("genome") 6637 6638 spip = find( 6639 f"transcriptome_{spip_assembly}.RData", 6640 config.get("folders", {}).get("databases", {}).get("spip", {}), 6641 ) 6642 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6643 log.debug(f"SPiP annotations: {spip}") 6644 log.debug(f"SpliceAI annotations: {spliceai}") 6645 if spip and spliceai: 6646 return [ 6647 f"--spip_transcriptome {spip}", 6648 f"--spliceai_transcriptome {spliceai}", 6649 ] 6650 else: 6651 log.warning( 6652 "Can't find splice databases in configuration, use annotations file from image" 6653 ) 6654 except TypeError: 6655 log.warning( 6656 "Can't find splice databases in configuration, use annotations file from image" 6657 ) 6658 return [] 6659 6660 # Add options, check if transcriptome option have already beend provided 6661 if ( 6662 "spip_transcriptome" not in nf_params 6663 and "spliceai_transcriptome" not in nf_params 6664 ): 6665 splice_reference = splice_annotations(options, config) 6666 if splice_reference: 6667 
nf_params.extend(splice_reference) 6668 # nf_params.append(f"--output_folder {output_folder}") 6669 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6670 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6671 log.debug(cmd) 6672 splice_config["docker"]["command"] = cmd 6673 6674 # Ensure proxy is set 6675 proxy = [ 6676 f"-e {var}={os.getenv(var)}" 6677 for var in ["https_proxy", "http_proxy", "ftp_proxy"] 6678 if os.getenv(var) is not None 6679 ] 6680 docker_cmd = get_bin_command( 6681 tool="splice", 6682 bin_type="docker", 6683 config=config, 6684 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6685 add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}", 6686 ) 6687 # print(docker_cmd) 6688 # exit() 6689 # Docker debug 6690 # if splice_config.get("rm_container"): 6691 # rm_container = "--rm" 6692 # else: 6693 # rm_container = "" 6694 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6695 log.debug(docker_cmd) 6696 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6697 log.debug(res.stdout) 6698 if res.stderr: 6699 log.error(res.stderr) 6700 res.check_returncode() 6701 # Update variants 6702 log.info("Annotation - Updating...") 6703 # Test find output vcf 6704 log.debug( 6705 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6706 ) 6707 output_vcf = [] 6708 # Wrong folder to look in 6709 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6710 if ( 6711 files 6712 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6713 ): 6714 
output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6715 # log.debug(os.listdir(options.get("output_folder"))) 6716 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6717 if not output_vcf: 6718 log.debug( 6719 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6720 ) 6721 else: 6722 # Get new header from annotated vcf 6723 log.debug(f"Initial header: {len(header.infos)} fields") 6724 # Create new header with splice infos 6725 new_vcf = Variants(input=output_vcf[0]) 6726 new_vcf_header = new_vcf.get_header().infos 6727 for keys, infos in new_vcf_header.items(): 6728 if keys not in header.infos.keys(): 6729 header.infos[keys] = infos 6730 log.debug(f"New header: {len(header.infos)} fields") 6731 log.debug(f"Splice tmp output: {output_vcf[0]}") 6732 self.update_from_vcf(output_vcf[0]) 6733 6734 # Remove file 6735 remove_if_exists(output_vcf) 6736 6737 ### 6738 # Prioritization 6739 ### 6740 6741 def get_config_default(self, name: str) -> dict: 6742 """ 6743 The function `get_config_default` returns a dictionary containing default configurations for 6744 various calculations and prioritizations. 6745 6746 :param name: The `get_config_default` function returns a dictionary containing default 6747 configurations for different calculations and prioritizations. The `name` parameter is used to 6748 specify which specific configuration to retrieve from the dictionary 6749 :type name: str 6750 :return: The function `get_config_default` returns a dictionary containing default configuration 6751 settings for different calculations and prioritizations. The specific configuration settings are 6752 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6753 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6754 returned. If there is no match, an empty dictionary is returned. 
6755 """ 6756 6757 config_default = { 6758 "calculations": { 6759 "variant_chr_pos_alt_ref": { 6760 "type": "sql", 6761 "name": "variant_chr_pos_alt_ref", 6762 "description": "Create a variant ID with chromosome, position, alt and ref", 6763 "available": False, 6764 "output_column_name": "variant_chr_pos_alt_ref", 6765 "output_column_type": "String", 6766 "output_column_description": "variant ID with chromosome, position, alt and ref", 6767 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6768 "operation_info": True, 6769 }, 6770 "VARTYPE": { 6771 "type": "sql", 6772 "name": "VARTYPE", 6773 "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)", 6774 "available": True, 6775 "table": "variants", 6776 "output_column_name": "VARTYPE", 6777 "output_column_type": "String", 6778 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6779 "operation_query": """ 6780 CASE 6781 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6782 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6783 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6784 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6785 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6786 ELSE 'UNDEFINED' 6787 END 6788 """, 6789 "info_fields": ["SVTYPE"], 6790 "operation_info": True, 6791 }, 6792 "snpeff_hgvs": { 6793 "type": "python", 6794 "name": "snpeff_hgvs", 6795 "description": "HGVS nomenclatures from snpEff annotation", 6796 "available": True, 6797 "function_name": "calculation_extract_snpeff_hgvs", 6798 "function_params": ["snpeff_hgvs", "ANN"], 6799 }, 6800 "snpeff_ann_explode": { 6801 "type": "python", 6802 "name": "snpeff_ann_explode", 6803 "description": "Explode snpEff annotations with uniquify values", 6804 "available": True, 6805 "function_name": "calculation_snpeff_ann_explode", 6806 "function_params": [False, "fields", "snpeff_", "ANN"], 6807 }, 6808 "snpeff_ann_explode_uniquify": { 6809 "type": "python", 6810 
"name": "snpeff_ann_explode_uniquify", 6811 "description": "Explode snpEff annotations", 6812 "available": True, 6813 "function_name": "calculation_snpeff_ann_explode", 6814 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6815 }, 6816 "snpeff_ann_explode_json": { 6817 "type": "python", 6818 "name": "snpeff_ann_explode_json", 6819 "description": "Explode snpEff annotations in JSON format", 6820 "available": True, 6821 "function_name": "calculation_snpeff_ann_explode", 6822 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6823 }, 6824 "NOMEN": { 6825 "type": "python", 6826 "name": "NOMEN", 6827 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)", 6828 "available": True, 6829 "function_name": "calculation_extract_nomen", 6830 "function_params": [], 6831 }, 6832 "FINDBYPIPELINE": { 6833 "type": "python", 6834 "name": "FINDBYPIPELINE", 6835 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6836 "available": True, 6837 "function_name": "calculation_find_by_pipeline", 6838 "function_params": ["findbypipeline"], 6839 }, 6840 "FINDBYSAMPLE": { 6841 "type": "python", 6842 "name": "FINDBYSAMPLE", 6843 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6844 "available": True, 6845 "function_name": "calculation_find_by_pipeline", 6846 "function_params": ["findbysample"], 6847 }, 6848 "GENOTYPECONCORDANCE": { 6849 "type": "python", 6850 "name": "GENOTYPECONCORDANCE", 6851 "description": "Concordance of genotype for multi caller VCF", 6852 "available": True, 6853 "function_name": "calculation_genotype_concordance", 6854 "function_params": [], 6855 }, 6856 "BARCODE": { 6857 "type": "python", 6858 "name": "BARCODE", 6859 "description": "BARCODE as VaRank tool", 6860 "available": True, 6861 "function_name": "calculation_barcode", 6862 "function_params": [], 6863 }, 6864 "BARCODEFAMILY": { 6865 "type": "python", 
6866 "name": "BARCODEFAMILY", 6867 "description": "BARCODEFAMILY as VaRank tool", 6868 "available": True, 6869 "function_name": "calculation_barcode_family", 6870 "function_params": ["BCF"], 6871 }, 6872 "TRIO": { 6873 "type": "python", 6874 "name": "TRIO", 6875 "description": "Inheritance for a trio family", 6876 "available": True, 6877 "function_name": "calculation_trio", 6878 "function_params": [], 6879 }, 6880 "VAF": { 6881 "type": "python", 6882 "name": "VAF", 6883 "description": "Variant Allele Frequency (VAF) harmonization", 6884 "available": True, 6885 "function_name": "calculation_vaf_normalization", 6886 "function_params": [], 6887 }, 6888 "VAF_stats": { 6889 "type": "python", 6890 "name": "VAF_stats", 6891 "description": "Variant Allele Frequency (VAF) statistics", 6892 "available": True, 6893 "function_name": "calculation_genotype_stats", 6894 "function_params": ["VAF"], 6895 }, 6896 "DP_stats": { 6897 "type": "python", 6898 "name": "DP_stats", 6899 "description": "Depth (DP) statistics", 6900 "available": True, 6901 "function_name": "calculation_genotype_stats", 6902 "function_params": ["DP"], 6903 }, 6904 "variant_id": { 6905 "type": "python", 6906 "name": "variant_id", 6907 "description": "Variant ID generated from variant position and type", 6908 "available": True, 6909 "function_name": "calculation_variant_id", 6910 "function_params": [], 6911 }, 6912 "transcripts_json": { 6913 "type": "python", 6914 "name": "transcripts_json", 6915 "description": "Add transcripts annotations in JSON format (field 'transcripts_json')", 6916 "available": True, 6917 "function_name": "calculation_transcripts_annotation", 6918 "function_params": ["transcripts_json", None], 6919 }, 6920 "transcripts_ann": { 6921 "type": "python", 6922 "name": "transcripts_ann", 6923 "description": "Add transcripts annotations in structured format (field 'transcripts_ann')", 6924 "available": True, 6925 "function_name": "calculation_transcripts_annotation", 6926 "function_params": [None, 
"transcripts_ann"], 6927 }, 6928 "transcripts_annotations": { 6929 "type": "python", 6930 "name": "transcripts_annotations", 6931 "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)", 6932 "available": True, 6933 "function_name": "calculation_transcripts_annotation", 6934 "function_params": [None, None], 6935 }, 6936 "transcripts_prioritization": { 6937 "type": "python", 6938 "name": "transcripts_prioritization", 6939 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6940 "available": True, 6941 "function_name": "calculation_transcripts_prioritization", 6942 "function_params": [], 6943 }, 6944 "transcripts_export": { 6945 "type": "python", 6946 "name": "transcripts_export", 6947 "description": "Export transcripts table/view as a file (using param.json)", 6948 "available": True, 6949 "function_name": "calculation_transcripts_export", 6950 "function_params": [], 6951 }, 6952 }, 6953 "prioritizations": { 6954 "default": { 6955 "ANN2": [ 6956 { 6957 "type": "contains", 6958 "value": "HIGH", 6959 "score": 5, 6960 "flag": "PASS", 6961 "comment": [ 6962 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6963 ], 6964 }, 6965 { 6966 "type": "contains", 6967 "value": "MODERATE", 6968 "score": 3, 6969 "flag": "PASS", 6970 "comment": [ 6971 "A non-disruptive variant that might change protein effectiveness" 6972 ], 6973 }, 6974 { 6975 "type": "contains", 6976 "value": "LOW", 6977 "score": 0, 6978 "flag": "FILTERED", 6979 "comment": [ 6980 "Assumed to be mostly harmless or unlikely to change protein behavior" 6981 ], 6982 }, 6983 { 6984 "type": "contains", 6985 "value": "MODIFIER", 6986 "score": 0, 6987 "flag": "FILTERED", 6988 "comment": [ 6989 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 
6990 ], 6991 }, 6992 ], 6993 } 6994 }, 6995 } 6996 6997 return config_default.get(name, None) 6998 6999 def get_config_json( 7000 self, name: str, config_dict: dict = {}, config_file: str = None 7001 ) -> dict: 7002 """ 7003 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 7004 default values, a dictionary, and a file. 7005 7006 :param name: The `name` parameter in the `get_config_json` function is a string that represents 7007 the name of the configuration. It is used to identify and retrieve the configuration settings 7008 for a specific component or module 7009 :type name: str 7010 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 7011 dictionary that allows you to provide additional configuration settings or overrides. When you 7012 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 7013 the key is the configuration setting you want to override or 7014 :type config_dict: dict 7015 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 7016 specify the path to a configuration file that contains additional settings. If provided, the 7017 function will read the contents of this file and update the configuration dictionary with the 7018 values found in the file, overriding any existing values with the 7019 :type config_file: str 7020 :return: The function `get_config_json` returns a dictionary containing the configuration 7021 settings. 
7022 """ 7023 7024 # Create with default prioritizations 7025 config_default = self.get_config_default(name=name) 7026 configuration = config_default 7027 # log.debug(f"configuration={configuration}") 7028 7029 # Replace prioritizations from dict 7030 for config in config_dict: 7031 configuration[config] = config_dict[config] 7032 7033 # Replace prioritizations from file 7034 config_file = full_path(config_file) 7035 if config_file: 7036 if os.path.exists(config_file): 7037 with open(config_file) as config_file_content: 7038 config_file_dict = json.load(config_file_content) 7039 for config in config_file_dict: 7040 configuration[config] = config_file_dict[config] 7041 else: 7042 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 7043 log.error(msg_error) 7044 raise ValueError(msg_error) 7045 7046 return configuration 7047 7048 def prioritization( 7049 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 7050 ) -> bool: 7051 """ 7052 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 7053 prioritizes variants based on configured profiles and criteria. 7054 7055 :param table: The `table` parameter in the `prioritization` function is used to specify the name 7056 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 7057 a table name is provided, the method will prioritize the variants in that specific table 7058 :type table: str 7059 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 7060 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 7061 provided, the code will use a default prefix value of "PZ" 7062 :type pz_prefix: str 7063 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 7064 additional parameters specific to the prioritization process. 
These parameters can include 7065 settings related to prioritization profiles, fields, scoring modes, flags, comments, and other 7066 configurations needed for the prioritization of variants in a V 7067 :type pz_param: dict 7068 :return: A boolean value (True) is being returned from the `prioritization` function. 7069 """ 7070 7071 # Config 7072 config = self.get_config() 7073 7074 # Param 7075 param = self.get_param() 7076 7077 # Prioritization param 7078 if pz_param is not None: 7079 prioritization_param = pz_param 7080 else: 7081 prioritization_param = param.get("prioritization", {}) 7082 7083 # Configuration profiles 7084 prioritization_config_file = prioritization_param.get( 7085 "prioritization_config", None 7086 ) 7087 prioritization_config_file = full_path(prioritization_config_file) 7088 prioritizations_config = self.get_config_json( 7089 name="prioritizations", config_file=prioritization_config_file 7090 ) 7091 7092 # Prioritization prefix 7093 pz_prefix_default = "PZ" 7094 if pz_prefix is None: 7095 pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default) 7096 7097 # Prioritization options 7098 profiles = prioritization_param.get("profiles", []) 7099 if isinstance(profiles, str): 7100 profiles = profiles.split(",") 7101 pzfields = prioritization_param.get( 7102 "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"] 7103 ) 7104 if isinstance(pzfields, str): 7105 pzfields = pzfields.split(",") 7106 default_profile = prioritization_param.get("default_profile", None) 7107 pzfields_sep = prioritization_param.get("pzfields_sep", "_") 7108 prioritization_score_mode = prioritization_param.get( 7109 "prioritization_score_mode", "HOWARD" 7110 ) 7111 7112 # Quick Prioritizations 7113 prioritizations = param.get("prioritizations", None) 7114 if prioritizations: 7115 log.info("Quick Prioritization:") 7116 for profile in prioritizations.split(","): 7117 if profile not in profiles: 7118 profiles.append(profile) 7119 log.info(f" {profile}") 7120 7121 # If 
profile "ALL" provided, all profiles in the config profiles 7122 if "ALL" in profiles: 7123 profiles = list(prioritizations_config.keys()) 7124 7125 for profile in profiles: 7126 if prioritizations_config.get(profile, None): 7127 log.debug(f"Profile '{profile}' configured") 7128 else: 7129 msg_error = f"Profile '{profile}' NOT configured" 7130 log.error(msg_error) 7131 raise ValueError(msg_error) 7132 7133 if profiles: 7134 log.info(f"Prioritization... ") 7135 else: 7136 log.debug(f"No profile defined") 7137 return False 7138 7139 if not default_profile and len(profiles): 7140 default_profile = profiles[0] 7141 7142 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 7143 log.debug("Profiles to check: " + str(list(profiles))) 7144 7145 # Variables 7146 if table is not None: 7147 table_variants = table 7148 else: 7149 table_variants = self.get_table_variants(clause="update") 7150 log.debug(f"Table to prioritize: {table_variants}") 7151 7152 # Added columns 7153 added_columns = [] 7154 7155 # Create list of PZfields 7156 # List of PZFields 7157 list_of_pzfields_original = pzfields + [ 7158 pzfield + pzfields_sep + profile 7159 for pzfield in pzfields 7160 for profile in profiles 7161 ] 7162 list_of_pzfields = [] 7163 log.debug(f"{list_of_pzfields_original}") 7164 7165 # Remove existing PZfields to use if exists 7166 for pzfield in list_of_pzfields_original: 7167 if self.get_header().infos.get(pzfield, None) is None: 7168 list_of_pzfields.append(pzfield) 7169 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 7170 else: 7171 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 7172 7173 if list_of_pzfields: 7174 7175 # Explode Infos prefix 7176 explode_infos_prefix = self.get_explode_infos_prefix() 7177 7178 # PZfields tags description 7179 PZfields_INFOS = { 7180 f"{pz_prefix}Tags": { 7181 "ID": f"{pz_prefix}Tags", 7182 "Number": ".", 7183 "Type": "String", 7184 "Description": "Variant tags based on annotation 
criteria", 7185 }, 7186 f"{pz_prefix}Score": { 7187 "ID": f"{pz_prefix}Score", 7188 "Number": 1, 7189 "Type": "Integer", 7190 "Description": "Variant score based on annotation criteria", 7191 }, 7192 f"{pz_prefix}Flag": { 7193 "ID": f"{pz_prefix}Flag", 7194 "Number": 1, 7195 "Type": "String", 7196 "Description": "Variant flag based on annotation criteria", 7197 }, 7198 f"{pz_prefix}Comment": { 7199 "ID": f"{pz_prefix}Comment", 7200 "Number": ".", 7201 "Type": "String", 7202 "Description": "Variant comment based on annotation criteria", 7203 }, 7204 f"{pz_prefix}Infos": { 7205 "ID": f"{pz_prefix}Infos", 7206 "Number": ".", 7207 "Type": "String", 7208 "Description": "Variant infos based on annotation criteria", 7209 }, 7210 f"{pz_prefix}Class": { 7211 "ID": f"{pz_prefix}Class", 7212 "Number": ".", 7213 "Type": "String", 7214 "Description": "Variant class based on annotation criteria", 7215 }, 7216 } 7217 7218 # Create INFO fields if not exist 7219 for field in PZfields_INFOS: 7220 field_ID = PZfields_INFOS[field]["ID"] 7221 field_description = PZfields_INFOS[field]["Description"] 7222 if field_ID not in self.get_header().infos and field_ID in pzfields: 7223 field_description = ( 7224 PZfields_INFOS[field]["Description"] 7225 + f", profile {default_profile}" 7226 ) 7227 self.get_header().infos[field_ID] = vcf.parser._Info( 7228 field_ID, 7229 PZfields_INFOS[field]["Number"], 7230 PZfields_INFOS[field]["Type"], 7231 field_description, 7232 "unknown", 7233 "unknown", 7234 code_type_map[PZfields_INFOS[field]["Type"]], 7235 ) 7236 7237 # Create INFO fields if not exist for each profile 7238 for profile in prioritizations_config: 7239 if profile in profiles or profiles == []: 7240 for field in PZfields_INFOS: 7241 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 7242 field_description = ( 7243 PZfields_INFOS[field]["Description"] 7244 + f", profile {profile}" 7245 ) 7246 if ( 7247 field_ID not in self.get_header().infos 7248 and field in pzfields 7249 ): 
7250 self.get_header().infos[field_ID] = vcf.parser._Info( 7251 field_ID, 7252 PZfields_INFOS[field]["Number"], 7253 PZfields_INFOS[field]["Type"], 7254 field_description, 7255 "unknown", 7256 "unknown", 7257 code_type_map[PZfields_INFOS[field]["Type"]], 7258 ) 7259 7260 # Header 7261 for pzfield in list_of_pzfields: 7262 if re.match(f"{pz_prefix}Score.*", pzfield): 7263 added_column = self.add_column( 7264 table_name=table_variants, 7265 column_name=pzfield, 7266 column_type="INTEGER", 7267 default_value="0", 7268 ) 7269 elif re.match(f"{pz_prefix}Flag.*", pzfield): 7270 added_column = self.add_column( 7271 table_name=table_variants, 7272 column_name=pzfield, 7273 column_type="BOOLEAN", 7274 default_value="1", 7275 ) 7276 elif re.match(f"{pz_prefix}Class.*", pzfield): 7277 added_column = self.add_column( 7278 table_name=table_variants, 7279 column_name=pzfield, 7280 column_type="VARCHAR[]", 7281 default_value="null", 7282 ) 7283 else: 7284 added_column = self.add_column( 7285 table_name=table_variants, 7286 column_name=pzfield, 7287 column_type="STRING", 7288 default_value="''", 7289 ) 7290 added_columns.append(added_column) 7291 7292 # Profiles 7293 if profiles: 7294 7295 # foreach profile in configuration file 7296 for profile in prioritizations_config: 7297 7298 # If profile is asked in param, or ALL are asked (empty profile []) 7299 if profile in profiles or profiles == []: 7300 log.info(f"Profile '{profile}'") 7301 7302 sql_set_info_option = "" 7303 7304 sql_set_info = [] 7305 7306 # PZ fields set 7307 7308 # PZScore 7309 if ( 7310 f"{pz_prefix}Score{pzfields_sep}{profile}" 7311 in list_of_pzfields 7312 ): 7313 sql_set_info.append( 7314 f""" 7315 concat( 7316 '{pz_prefix}Score{pzfields_sep}{profile}=', 7317 {pz_prefix}Score{pzfields_sep}{profile} 7318 ) 7319 """ 7320 ) 7321 if ( 7322 profile == default_profile 7323 and f"{pz_prefix}Score" in list_of_pzfields 7324 ): 7325 sql_set_info.append( 7326 f""" 7327 concat( 7328 '{pz_prefix}Score=', 7329 
{pz_prefix}Score{pzfields_sep}{profile} 7330 ) 7331 """ 7332 ) 7333 7334 # PZFlag 7335 if ( 7336 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7337 in list_of_pzfields 7338 ): 7339 sql_set_info.append( 7340 f""" 7341 concat( 7342 '{pz_prefix}Flag{pzfields_sep}{profile}=', 7343 CASE 7344 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7345 THEN 'PASS' 7346 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7347 THEN 'FILTERED' 7348 END 7349 ) 7350 """ 7351 ) 7352 if ( 7353 profile == default_profile 7354 and f"{pz_prefix}Flag" in list_of_pzfields 7355 ): 7356 sql_set_info.append( 7357 f""" 7358 concat( 7359 '{pz_prefix}Flag=', 7360 CASE 7361 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7362 THEN 'PASS' 7363 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7364 THEN 'FILTERED' 7365 END 7366 ) 7367 """ 7368 ) 7369 7370 # PZClass 7371 if ( 7372 f"{pz_prefix}Class{pzfields_sep}{profile}" 7373 in list_of_pzfields 7374 ): 7375 sql_set_info.append( 7376 f""" 7377 concat( 7378 '{pz_prefix}Class{pzfields_sep}{profile}=', 7379 CASE 7380 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7381 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7382 ELSE '.' 7383 END 7384 ) 7385 7386 """ 7387 ) 7388 if ( 7389 profile == default_profile 7390 and f"{pz_prefix}Class" in list_of_pzfields 7391 ): 7392 sql_set_info.append( 7393 f""" 7394 concat( 7395 '{pz_prefix}Class=', 7396 CASE 7397 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7398 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7399 ELSE '.' 
7400 END 7401 ) 7402 """ 7403 ) 7404 7405 # PZComment 7406 if ( 7407 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7408 in list_of_pzfields 7409 ): 7410 sql_set_info.append( 7411 f""" 7412 CASE 7413 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7414 THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile}) 7415 ELSE '' 7416 END 7417 """ 7418 ) 7419 if ( 7420 profile == default_profile 7421 and f"{pz_prefix}Comment" in list_of_pzfields 7422 ): 7423 sql_set_info.append( 7424 f""" 7425 CASE 7426 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7427 THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile}) 7428 ELSE '' 7429 END 7430 """ 7431 ) 7432 7433 # PZInfos 7434 if ( 7435 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7436 in list_of_pzfields 7437 ): 7438 sql_set_info.append( 7439 f""" 7440 CASE 7441 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7442 THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile}) 7443 ELSE '' 7444 END 7445 """ 7446 ) 7447 if ( 7448 profile == default_profile 7449 and f"{pz_prefix}Infos" in list_of_pzfields 7450 ): 7451 sql_set_info.append( 7452 f""" 7453 CASE 7454 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7455 THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile}) 7456 ELSE '' 7457 END 7458 """ 7459 ) 7460 7461 # Merge PZfields 7462 sql_set_info_option = "" 7463 sql_set_sep = "" 7464 for sql_set in sql_set_info: 7465 if sql_set_sep: 7466 sql_set_info_option += f""" 7467 , concat('{sql_set_sep}', {sql_set}) 7468 """ 7469 else: 7470 sql_set_info_option += f""" 7471 , {sql_set} 7472 """ 7473 sql_set_sep = ";" 7474 7475 sql_queries = [] 7476 for annotation in prioritizations_config[profile]: 7477 7478 # skip special sections 7479 if annotation.startswith("_"): 7480 continue 7481 7482 # For each criterions 7483 for criterion in prioritizations_config[profile][ 7484 annotation 
7485 ]: 7486 7487 # Criterion mode 7488 criterion_mode = None 7489 if np.any( 7490 np.isin(list(criterion.keys()), ["type", "value"]) 7491 ): 7492 criterion_mode = "operation" 7493 elif np.any( 7494 np.isin(list(criterion.keys()), ["sql", "fields"]) 7495 ): 7496 criterion_mode = "sql" 7497 log.debug(f"Criterion Mode: {criterion_mode}") 7498 7499 # Criterion parameters 7500 criterion_type = criterion.get("type", None) 7501 criterion_value = criterion.get("value", None) 7502 criterion_sql = criterion.get("sql", None) 7503 criterion_fields = criterion.get("fields", None) 7504 criterion_score = criterion.get("score", 0) 7505 criterion_flag = criterion.get("flag", "PASS") 7506 criterion_class = criterion.get("class", None) 7507 criterion_flag_bool = criterion_flag == "PASS" 7508 criterion_comment = ( 7509 ", ".join(criterion.get("comment", [])) 7510 .replace("'", "''") 7511 .replace(";", ",") 7512 .replace("\t", " ") 7513 ) 7514 criterion_infos = ( 7515 str(criterion) 7516 .replace("'", "''") 7517 .replace(";", ",") 7518 .replace("\t", " ") 7519 ) 7520 7521 # SQL 7522 if criterion_sql is not None and isinstance( 7523 criterion_sql, list 7524 ): 7525 criterion_sql = " ".join(criterion_sql) 7526 7527 # Fields and explode 7528 if criterion_fields is None: 7529 criterion_fields = [annotation] 7530 if not isinstance(criterion_fields, list): 7531 criterion_fields = str(criterion_fields).split(",") 7532 7533 # Class 7534 if criterion_class is not None and not isinstance( 7535 criterion_class, list 7536 ): 7537 criterion_class = str(criterion_class).split(",") 7538 7539 for annotation_field in criterion_fields: 7540 7541 # Explode specific annotation 7542 log.debug( 7543 f"Explode annotation '{annotation_field}'" 7544 ) 7545 added_columns += self.explode_infos( 7546 prefix=explode_infos_prefix, 7547 fields=[annotation_field], 7548 table=table_variants, 7549 ) 7550 extra_infos = self.get_extra_infos( 7551 table=table_variants 7552 ) 7553 7554 # Check if annotation field is 
present 7555 if ( 7556 f"{explode_infos_prefix}{annotation_field}" 7557 not in extra_infos 7558 ): 7559 msq_err = f"Annotation '{annotation_field}' not in data" 7560 log.error(msq_err) 7561 raise ValueError(msq_err) 7562 else: 7563 log.debug( 7564 f"Annotation '{annotation_field}' in data" 7565 ) 7566 7567 sql_set = [] 7568 sql_set_info = [] 7569 7570 # PZ fields set 7571 7572 # PZScore 7573 if ( 7574 f"{pz_prefix}Score{pzfields_sep}{profile}" 7575 in list_of_pzfields 7576 ): 7577 # if prioritization_score_mode == "HOWARD": 7578 # sql_set.append( 7579 # f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7580 # ) 7581 # VaRank prioritization score mode 7582 if prioritization_score_mode == "VaRank": 7583 sql_set.append( 7584 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END" 7585 ) 7586 # default HOWARD prioritization score mode 7587 else: 7588 sql_set.append( 7589 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7590 ) 7591 7592 # PZFlag 7593 if ( 7594 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7595 in list_of_pzfields 7596 ): 7597 sql_set.append( 7598 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}" 7599 ) 7600 7601 # PZClass 7602 if ( 7603 f"{pz_prefix}Class{pzfields_sep}{profile}" 7604 in list_of_pzfields 7605 and criterion_class is not None 7606 ): 7607 sql_set.append( 7608 f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) " 7609 ) 7610 7611 # PZComment 7612 if ( 7613 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7614 in list_of_pzfields 7615 ): 7616 sql_set.append( 7617 f""" 7618 {pz_prefix}Comment{pzfields_sep}{profile} = 7619 concat( 7620 {pz_prefix}Comment{pzfields_sep}{profile}, 7621 CASE 7622 WHEN 
{pz_prefix}Comment{pzfields_sep}{profile}!='' 7623 THEN ', ' 7624 ELSE '' 7625 END, 7626 '{criterion_comment}' 7627 ) 7628 """ 7629 ) 7630 7631 # PZInfos 7632 if ( 7633 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7634 in list_of_pzfields 7635 ): 7636 sql_set.append( 7637 f""" 7638 {pz_prefix}Infos{pzfields_sep}{profile} = 7639 concat( 7640 {pz_prefix}Infos{pzfields_sep}{profile}, 7641 '{criterion_infos}' 7642 ) 7643 """ 7644 ) 7645 sql_set_option = ",".join(sql_set) 7646 7647 # Criterion and comparison 7648 if sql_set_option: 7649 7650 if criterion_mode in ["operation"]: 7651 7652 try: 7653 float(criterion_value) 7654 sql_update = f""" 7655 UPDATE {table_variants} 7656 SET {sql_set_option} 7657 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7658 AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value} 7659 """ 7660 except: 7661 contains_option = "" 7662 if criterion_type == "contains": 7663 contains_option = ".*" 7664 sql_update = f""" 7665 UPDATE {table_variants} 7666 SET {sql_set_option} 7667 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7668 """ 7669 sql_queries.append(sql_update) 7670 7671 elif criterion_mode in ["sql"]: 7672 7673 sql_update = f""" 7674 UPDATE {table_variants} 7675 SET {sql_set_option} 7676 WHERE {criterion_sql} 7677 """ 7678 sql_queries.append(sql_update) 7679 7680 else: 7681 msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')" 7682 log.error(msg_err) 7683 raise ValueError(msg_err) 7684 7685 else: 7686 log.warning( 7687 f"NO SQL SET option for '{annotation}' - '{criterion}'" 7688 ) 7689 7690 # PZTags 7691 if ( 7692 f"{pz_prefix}Tags{pzfields_sep}{profile}" 7693 in list_of_pzfields 7694 ): 7695 7696 # Create PZFalgs value 7697 pztags_value = "" 7698 pztags_sep_default = "," 7699 pztags_sep = "" 7700 for pzfield in pzfields: 7701 if pzfield not in [f"{pz_prefix}Tags"]: 7702 if ( 7703 
f"{pzfield}{pzfields_sep}{profile}" 7704 in list_of_pzfields 7705 ): 7706 if pzfield in [f"{pz_prefix}Flag"]: 7707 pztags_value += f"""{pztags_sep}{pzfield}#', 7708 CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile} 7709 THEN 'PASS' 7710 ELSE 'FILTERED' 7711 END, '""" 7712 elif pzfield in [f"{pz_prefix}Class"]: 7713 pztags_value += f"""{pztags_sep}{pzfield}#', 7714 CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7715 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7716 ELSE '.' 7717 END, '""" 7718 else: 7719 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7720 pztags_sep = pztags_sep_default 7721 7722 # Add Query update for PZFlags 7723 sql_update_pztags = f""" 7724 UPDATE {table_variants} 7725 SET INFO = concat( 7726 INFO, 7727 CASE WHEN INFO NOT in ('','.') 7728 THEN ';' 7729 ELSE '' 7730 END, 7731 '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}' 7732 ) 7733 """ 7734 sql_queries.append(sql_update_pztags) 7735 7736 # Add Query update for PZFlags for default 7737 if profile == default_profile: 7738 sql_update_pztags_default = f""" 7739 UPDATE {table_variants} 7740 SET INFO = concat( 7741 INFO, 7742 ';', 7743 '{pz_prefix}Tags={pztags_value}' 7744 ) 7745 """ 7746 sql_queries.append(sql_update_pztags_default) 7747 7748 log.info(f"""Profile '{profile}' - Prioritization... """) 7749 7750 if sql_queries: 7751 7752 for sql_query in sql_queries: 7753 log.debug( 7754 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 7755 ) 7756 self.conn.execute(sql_query) 7757 7758 log.info(f"""Profile '{profile}' - Update... 
""") 7759 sql_query_update = f""" 7760 UPDATE {table_variants} 7761 SET INFO = 7762 concat( 7763 CASE 7764 WHEN INFO NOT IN ('','.') 7765 THEN concat(INFO, ';') 7766 ELSE '' 7767 END 7768 {sql_set_info_option} 7769 ) 7770 """ 7771 self.conn.execute(sql_query_update) 7772 7773 else: 7774 7775 log.warning(f"No profiles in parameters") 7776 7777 # Remove added columns 7778 for added_column in added_columns: 7779 self.drop_column(column=added_column) 7780 7781 # Explode INFOS fields into table fields 7782 if self.get_explode_infos(): 7783 self.explode_infos( 7784 prefix=self.get_explode_infos_prefix(), 7785 fields=self.get_explode_infos_fields(), 7786 force=True, 7787 ) 7788 7789 return True 7790 7791 ### 7792 # HGVS 7793 ### 7794 7795 def annotation_hgvs(self, threads: int = None) -> None: 7796 """ 7797 The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic 7798 coordinates and alleles. 7799 7800 :param threads: The `threads` parameter is an optional integer that specifies the number of 7801 threads to use for parallel processing. If no value is provided, it will default to the number 7802 of threads obtained from the `get_threads()` method 7803 :type threads: int 7804 """ 7805 7806 # Function for each partition of the Dask Dataframe 7807 def partition_function(partition): 7808 """ 7809 The function `partition_function` applies the `annotation_hgvs_partition` function to 7810 each row of a DataFrame called `partition`. 7811 7812 :param partition: The parameter "partition" is a pandas DataFrame that contains the data 7813 to be processed 7814 :return: the result of applying the "annotation_hgvs_partition" function to each row of 7815 the "partition" dataframe along the axis 1. 
7816 """ 7817 return partition.apply(annotation_hgvs_partition, axis=1) 7818 7819 def annotation_hgvs_partition(row) -> str: 7820 """ 7821 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7822 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7823 7824 :param row: A dictionary-like object that contains the values for the following keys: 7825 :return: a string that contains the HGVS names associated with the given row of data. 7826 """ 7827 7828 chr = row["CHROM"] 7829 pos = row["POS"] 7830 ref = row["REF"] 7831 alt = row["ALT"] 7832 7833 # Find list of associated transcripts 7834 transcripts_list = list( 7835 polars_conn.execute( 7836 f""" 7837 SELECT transcript 7838 FROM refseq_df 7839 WHERE CHROM='{chr}' 7840 AND POS={pos} 7841 """ 7842 )["transcript"] 7843 ) 7844 7845 # Full HGVS annotation in list 7846 hgvs_full_list = [] 7847 7848 for transcript_name in transcripts_list: 7849 7850 # Transcript 7851 transcript = get_transcript( 7852 transcripts=transcripts, transcript_name=transcript_name 7853 ) 7854 # Exon 7855 if use_exon: 7856 exon = transcript.find_exon_number(pos) 7857 else: 7858 exon = None 7859 # Protein 7860 transcript_protein = None 7861 if use_protein or add_protein or full_format: 7862 transcripts_protein = list( 7863 polars_conn.execute( 7864 f""" 7865 SELECT protein 7866 FROM refseqlink_df 7867 WHERE transcript='{transcript_name}' 7868 LIMIT 1 7869 """ 7870 )["protein"] 7871 ) 7872 if len(transcripts_protein): 7873 transcript_protein = transcripts_protein[0] 7874 7875 # HGVS name 7876 hgvs_name = format_hgvs_name( 7877 chr, 7878 pos, 7879 ref, 7880 alt, 7881 genome=genome, 7882 transcript=transcript, 7883 transcript_protein=transcript_protein, 7884 exon=exon, 7885 use_gene=use_gene, 7886 use_protein=use_protein, 7887 full_format=full_format, 7888 use_version=use_version, 7889 codon_type=codon_type, 7890 ) 7891 hgvs_full_list.append(hgvs_name) 7892 if add_protein and not 
use_protein and not full_format: 7893 hgvs_name = format_hgvs_name( 7894 chr, 7895 pos, 7896 ref, 7897 alt, 7898 genome=genome, 7899 transcript=transcript, 7900 transcript_protein=transcript_protein, 7901 exon=exon, 7902 use_gene=use_gene, 7903 use_protein=True, 7904 full_format=False, 7905 use_version=use_version, 7906 codon_type=codon_type, 7907 ) 7908 hgvs_full_list.append(hgvs_name) 7909 7910 # Create liste of HGVS annotations 7911 hgvs_full = ",".join(hgvs_full_list) 7912 7913 return hgvs_full 7914 7915 # Polars connexion 7916 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7917 7918 # Config 7919 config = self.get_config() 7920 7921 # Databases 7922 # Genome 7923 databases_genomes_folders = ( 7924 config.get("folders", {}) 7925 .get("databases", {}) 7926 .get("genomes", DEFAULT_GENOME_FOLDER) 7927 ) 7928 databases_genome = ( 7929 config.get("folders", {}).get("databases", {}).get("genomes", "") 7930 ) 7931 # refseq database folder 7932 databases_refseq_folders = ( 7933 config.get("folders", {}) 7934 .get("databases", {}) 7935 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7936 ) 7937 # refseq 7938 databases_refseq = config.get("databases", {}).get("refSeq", None) 7939 # refSeqLink 7940 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7941 7942 # Param 7943 param = self.get_param() 7944 7945 # Quick HGVS 7946 if "hgvs_options" in param and param.get("hgvs_options", ""): 7947 log.info(f"Quick HGVS Annotation:") 7948 if not param.get("hgvs", None): 7949 param["hgvs"] = {} 7950 for option in param.get("hgvs_options", "").split(","): 7951 option_var_val = option.split("=") 7952 option_var = option_var_val[0] 7953 if len(option_var_val) > 1: 7954 option_val = option_var_val[1] 7955 else: 7956 option_val = "True" 7957 if option_val.upper() in ["TRUE"]: 7958 option_val = True 7959 elif option_val.upper() in ["FALSE"]: 7960 option_val = False 7961 log.info(f" {option_var}={option_val}") 7962 param["hgvs"][option_var] = option_val 7963 
7964 # Check if HGVS annotation enabled 7965 if "hgvs" in param: 7966 log.info(f"HGVS Annotation... ") 7967 for hgvs_option in param.get("hgvs", {}): 7968 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7969 else: 7970 return 7971 7972 # HGVS Param 7973 param_hgvs = param.get("hgvs", {}) 7974 use_exon = param_hgvs.get("use_exon", False) 7975 use_gene = param_hgvs.get("use_gene", False) 7976 use_protein = param_hgvs.get("use_protein", False) 7977 add_protein = param_hgvs.get("add_protein", False) 7978 full_format = param_hgvs.get("full_format", False) 7979 use_version = param_hgvs.get("use_version", False) 7980 codon_type = param_hgvs.get("codon_type", "3") 7981 7982 # refSseq refSeqLink 7983 databases_refseq = param_hgvs.get("refseq", databases_refseq) 7984 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 7985 7986 # Assembly 7987 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 7988 7989 # Genome 7990 genome_file = None 7991 if find_genome(databases_genome): 7992 genome_file = find_genome(databases_genome) 7993 else: 7994 genome_file = find_genome( 7995 genome_path=databases_genomes_folders, assembly=assembly 7996 ) 7997 log.debug("Genome: " + str(genome_file)) 7998 7999 # refSseq 8000 refseq_file = find_file_prefix( 8001 input_file=databases_refseq, 8002 prefix="ncbiRefSeq", 8003 folder=databases_refseq_folders, 8004 assembly=assembly, 8005 ) 8006 log.debug("refSeq: " + str(refseq_file)) 8007 8008 # refSeqLink 8009 refseqlink_file = find_file_prefix( 8010 input_file=databases_refseqlink, 8011 prefix="ncbiRefSeqLink", 8012 folder=databases_refseq_folders, 8013 assembly=assembly, 8014 ) 8015 log.debug("refSeqLink: " + str(refseqlink_file)) 8016 8017 # Threads 8018 if not threads: 8019 threads = self.get_threads() 8020 log.debug("Threads: " + str(threads)) 8021 8022 # Variables 8023 table_variants = self.get_table_variants(clause="update") 8024 8025 # Get variants SNV and InDel only 8026 
query_variants = f""" 8027 SELECT "#CHROM" AS CHROM, POS, REF, ALT 8028 FROM {table_variants} 8029 WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 8030 """ 8031 df_variants = self.get_query_to_df(query_variants) 8032 8033 # Added columns 8034 added_columns = [] 8035 8036 # Add hgvs column in variants table 8037 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 8038 added_column = self.add_column( 8039 table_variants, hgvs_column_name, "STRING", default_value=None 8040 ) 8041 added_columns.append(added_column) 8042 8043 log.debug(f"refSeq loading...") 8044 # refSeq in duckDB 8045 refseq_table = get_refseq_table( 8046 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 8047 ) 8048 # Loading all refSeq in Dataframe 8049 refseq_query = f""" 8050 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 8051 FROM {refseq_table} 8052 JOIN df_variants ON ( 8053 {refseq_table}.chrom = df_variants.CHROM 8054 AND {refseq_table}.txStart<=df_variants.POS 8055 AND {refseq_table}.txEnd>=df_variants.POS 8056 ) 8057 """ 8058 refseq_df = self.conn.query(refseq_query).pl() 8059 8060 if refseqlink_file: 8061 log.debug(f"refSeqLink loading...") 8062 # refSeqLink in duckDB 8063 refseqlink_table = get_refseq_table( 8064 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 8065 ) 8066 # Loading all refSeqLink in Dataframe 8067 protacc_column = "protAcc_with_ver" 8068 mrnaacc_column = "mrnaAcc_with_ver" 8069 refseqlink_query = f""" 8070 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 8071 FROM {refseqlink_table} 8072 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 8073 WHERE protAcc_without_ver IS NOT NULL 8074 """ 8075 # Polars Dataframe 8076 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 8077 8078 # Read RefSeq transcripts into a python dict/model. 
8079 log.debug(f"Transcripts loading...") 8080 with tempfile.TemporaryDirectory() as tmpdir: 8081 transcripts_query = f""" 8082 COPY ( 8083 SELECT {refseq_table}.* 8084 FROM {refseq_table} 8085 JOIN df_variants ON ( 8086 {refseq_table}.chrom=df_variants.CHROM 8087 AND {refseq_table}.txStart<=df_variants.POS 8088 AND {refseq_table}.txEnd>=df_variants.POS 8089 ) 8090 ) 8091 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 8092 """ 8093 self.conn.query(transcripts_query) 8094 with open(f"{tmpdir}/transcript.tsv") as infile: 8095 transcripts = read_transcripts(infile) 8096 8097 # Polars connexion 8098 polars_conn = pl.SQLContext(register_globals=True, eager=True) 8099 8100 log.debug("Genome loading...") 8101 # Read genome sequence using pyfaidx. 8102 genome = Fasta(genome_file) 8103 8104 log.debug("Start annotation HGVS...") 8105 8106 # Create 8107 # a Dask Dataframe from Pandas dataframe with partition as number of threads 8108 ddf = dd.from_pandas(df_variants, npartitions=threads) 8109 8110 # Use dask.dataframe.apply() to apply function on each partition 8111 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 8112 8113 # Convert Dask DataFrame to Pandas Dataframe 8114 df = ddf.compute() 8115 8116 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
def get_operations_help(
    self, operations_config_dict: dict = None, operations_config_file: str = None
) -> list:
    """
    Build a sorted, human-readable help listing of available calculation operations.

    Operations are read from the "calculations" configuration (via `get_config_json`);
    only entries flagged `available: true` are listed, each as a "NAME: description"
    line, sorted alphabetically under a header line.

    :param operations_config_dict: optional in-memory operations configuration;
        defaults to an empty configuration (previously a mutable `{}` default)
    :type operations_config_dict: dict (optional)
    :param operations_config_file: optional path to an operations configuration file
    :type operations_config_file: str (optional)
    :return: list of help lines, starting with "Available calculation operations:"
    """

    # Avoid the shared-mutable-default pitfall: normalize None to a fresh dict,
    # which is exactly what the previous `= {}` default passed along.
    if operations_config_dict is None:
        operations_config_dict = {}

    # Init
    operations_help = []

    # operations
    operations = self.get_config_json(
        name="calculations",
        config_dict=operations_config_dict,
        config_file=operations_config_file,
    )
    for op in operations:
        op_name = operations[op].get("name", op).upper()
        # Description falls back to the (uppercased) operation name
        op_description = operations[op].get("description", op_name)
        op_available = operations[op].get("available", False)
        if op_available:
            operations_help.append(f"   {op_name}: {op_description}")

    # Sort operations
    operations_help.sort()

    # insert header
    operations_help.insert(0, "Available calculation operations:")

    # Return
    return operations_help
def calculation(
    self,
    operations: dict = {},
    operations_config_dict: dict = {},
    operations_config_file: str = None,
) -> None:
    """
    It takes a list of operations, and for each operation, it checks if it's a python or sql
    operation, and then calls the appropriate function

    Operations come, in order of precedence, from param["calculation"]["calculations"],
    then the `operations` argument; the quick param["calculations"] comma-separated
    string, when present, is merged in front (preserving its order).

    NOTE(review): `operations` and `operations_config_dict` use mutable `{}` defaults;
    they are not mutated here, but callers should not rely on sharing them.

    param json example:
        "calculation": {
          "NOMEN": {
            "options": {
              "hgvs_field": "hgvs"
            },
          "middle" : null
        }

    :param operations: mapping of operation name -> operation options
    :param operations_config_dict: optional in-memory operations configuration
    :param operations_config_file: optional path to an operations configuration file
    :raises ValueError: if an operation name or its type is not available in the
        operations configuration
    """

    # Param
    param = self.get_param()

    # operations config
    operations_config = self.get_config_json(
        name="calculations",
        config_dict=operations_config_dict,
        config_file=operations_config_file,
    )

    # Upper keys (operation lookup is case-insensitive)
    operations_config = {k.upper(): v for k, v in operations_config.items()}

    # Calculations

    # Operations from param
    operations = param.get("calculation", {}).get("calculations", operations)

    # Quick calculation - add
    if param.get("calculations", None):

        # List of operations
        calculations_list = [
            value.strip() for value in param.get("calculations", "").split(",")
        ]

        # Log
        log.info(f"Quick Calculations:")
        for calculation_key in calculations_list:
            log.info(f"   {calculation_key}")

        # Create tmp operations (to keep operation order)
        operations_tmp = {}
        for calculation_operation in calculations_list:
            if calculation_operation.upper() not in operations_tmp:
                log.debug(
                    f"{calculation_operation}.upper() not in {operations_tmp}"
                )
                operations_tmp[calculation_operation.upper()] = {}
                # Carry over any options already configured for this operation
                add_value_into_dict(
                    dict_tree=operations_tmp,
                    sections=[
                        calculation_operation.upper(),
                    ],
                    value=operations.get(calculation_operation.upper(), {}),
                )
        # Add operations already in param (after the quick ones, keeping order)
        for calculation_operation in operations:
            if calculation_operation not in operations_tmp:
                operations_tmp[calculation_operation] = operations.get(
                    calculation_operation, {}
                )

        # Update operations in param
        operations = operations_tmp

    # Operations for calculation (fallback to param when nothing collected so far)
    if not operations:
        operations = param.get("calculation", {}).get("calculations", {})

    if operations:
        log.info(f"Calculations...")

        # For each operations
        for operation_name in operations:
            operation_name = operation_name.upper()
            if operation_name not in [""]:
                if operation_name in operations_config:
                    log.info(f"Calculation '{operation_name}'")
                    operation = operations_config[operation_name]
                    # Dispatch on operation type: python function or SQL query
                    operation_type = operation.get("type", "sql")
                    if operation_type == "python":
                        self.calculation_process_function(
                            operation=operation, operation_name=operation_name
                        )
                    elif operation_type == "sql":
                        self.calculation_process_sql(
                            operation=operation, operation_name=operation_name
                        )
                    else:
                        log.error(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                else:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )

    # Explode INFOS fields into table fields
    if self.get_explode_infos():
        self.explode_infos(
            prefix=self.get_explode_infos_prefix(),
            fields=self.get_explode_infos_fields(),
            force=True,
        )
def calculation_process_sql(
    self, operation: dict, operation_name: str = "unknown"
) -> None:
    """
    The `calculation_process_sql` function takes in a mathematical operation as a string and
    performs the operation, updating the specified table with the result.

    The operation's SQL expression is evaluated into a (temporary) output column,
    which is then folded back into the INFO field as '<name>=<value>' (unless
    `operation_info` is false). Temporary columns are always dropped, including
    when the query fails.

    :param operation: The `operation` parameter is a dictionary that contains information about the
    mathematical operation to be performed. It includes the following keys:
    "name", "output_column_name", "output_column_type", "explode_infos_prefix",
    "output_column_description", "operation_query" (str or list of str),
    "info_fields", "info_fields_check", "operation_info", "table"
    :type operation: dict
    :param operation_name: The `operation_name` parameter is a string that represents the name of
    the mathematical operation being performed. It is used for logging and error handling purposes,
    defaults to unknown
    :type operation_name: str (optional)
    :raises ValueError: if the query is missing, a mandatory info field is absent,
    or the SQL update fails (original exception chained as the cause)
    """

    # Operation infos
    operation_name = operation.get("name", "unknown")
    log.debug(f"process sql {operation_name}")
    output_column_name = operation.get("output_column_name", operation_name)
    output_column_type = operation.get("output_column_type", "String")
    prefix = operation.get("explode_infos_prefix", "")
    output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
    output_column_description = operation.get(
        "output_column_description", f"{operation_name} operation"
    )
    # The query may be provided as a list of lines; join into one statement
    operation_query = operation.get("operation_query", None)
    if isinstance(operation_query, list):
        operation_query = " ".join(operation_query)
    operation_info_fields = operation.get("info_fields", [])
    operation_info_fields_check = operation.get("info_fields_check", False)
    operation_info = operation.get("operation_info", True)
    operation_table = operation.get(
        "table", self.get_table_variants(clause="alter")
    )

    # table variants
    if operation_table:
        table_variants = operation_table
    else:
        table_variants = self.get_table_variants(clause="alter")

    if operation_query:

        # Info fields check: every required INFO field must be in the VCF header
        operation_info_fields_check_result = True
        if operation_info_fields_check:
            header_infos = self.get_header().infos
            for info_field in operation_info_fields:
                operation_info_fields_check_result = (
                    operation_info_fields_check_result
                    and info_field in header_infos
                )

        # If info fields available
        if operation_info_fields_check_result:

            # Added_columns (temporary; dropped in the finally below)
            added_columns = []

            # Create VCF header field
            vcf_reader = self.get_header()
            vcf_reader.infos[output_column_name] = vcf.parser._Info(
                output_column_name,
                ".",
                output_column_type,
                output_column_description,
                "howard calculation",
                "0",
                self.code_type_map.get(output_column_type),
            )

            # Explode infos if needed
            log.debug(f"calculation_process_sql prefix {prefix}")
            added_columns += self.explode_infos(
                prefix=prefix,
                fields=[output_column_name] + operation_info_fields,
                force=False,
                table=table_variants,
            )

            # Create column
            added_column = self.add_column(
                table_name=table_variants,
                column_name=prefix + output_column_name,
                column_type=output_column_type_sql,
                default_value="null",
            )
            added_columns.append(added_column)

            # Operation calculation
            try:

                # Query to update calculation column
                sql_update = f"""
                UPDATE {table_variants}
                SET "{prefix}{output_column_name}" = ({operation_query})
                """
                self.conn.execute(sql_update)

                # Add to INFO: append '<name>=<value>' for non-empty results
                if operation_info:
                    sql_update_info = f"""
                    UPDATE {table_variants}
                    SET "INFO" = 
                        concat(
                            CASE
                                WHEN "INFO" IS NOT NULL
                                THEN concat("INFO", ';')
                                ELSE ''
                            END,
                            '{output_column_name}=',
                            "{prefix}{output_column_name}"
                        )
                    WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
                    """
                    self.conn.execute(sql_update_info)

            # Narrowed from a bare `except:`; chain the cause so the original
            # SQL error is not lost (callers still see a ValueError)
            except Exception as exc:
                log.error(
                    f"Operations config: Calculation '{operation_name}' query failed"
                )
                raise ValueError(
                    f"Operations config: Calculation '{operation_name}' query failed"
                ) from exc

            # Remove added columns — also on failure, so temporary columns
            # are not leaked into the variants table
            finally:
                for added_column in added_columns:
                    log.debug(f"added_column: {added_column}")
                    self.drop_column(column=added_column)

        else:
            log.error(
                f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
            )
            raise ValueError(
                f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
            )

    else:
        log.error(
            f"Operations config: Calculation '{operation_name}' query NOT defined"
        )
        raise ValueError(
            f"Operations config: Calculation '{operation_name}' query NOT defined"
        )
WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8448 """ 8449 self.conn.execute(sql_update_info) 8450 8451 except: 8452 log.error( 8453 f"Operations config: Calculation '{operation_name}' query failed" 8454 ) 8455 raise ValueError( 8456 f"Operations config: Calculation '{operation_name}' query failed" 8457 ) 8458 8459 # Remove added columns 8460 for added_column in added_columns: 8461 log.debug(f"added_column: {added_column}") 8462 self.drop_column(column=added_column) 8463 8464 else: 8465 log.error( 8466 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8467 ) 8468 raise ValueError( 8469 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8470 ) 8471 8472 else: 8473 log.error( 8474 f"Operations config: Calculation '{operation_name}' query NOT defined" 8475 ) 8476 raise ValueError( 8477 f"Operations config: Calculation '{operation_name}' query NOT defined" 8478 ) 8479 8480 def calculation_process_function( 8481 self, operation: dict, operation_name: str = "unknown" 8482 ) -> None: 8483 """ 8484 The `calculation_process_function` takes in an operation dictionary and performs the specified 8485 function with the given parameters. 8486 8487 :param operation: The `operation` parameter is a dictionary that contains information about the 8488 operation to be performed. It has the following keys: 8489 :type operation: dict 8490 :param operation_name: The `operation_name` parameter is a string that represents the name of 8491 the operation being performed. 
It is used for logging purposes, defaults to unknown 8492 :type operation_name: str (optional) 8493 """ 8494 8495 operation_name = operation["name"] 8496 log.debug(f"process sql {operation_name}") 8497 function_name = operation["function_name"] 8498 function_params = operation["function_params"] 8499 getattr(self, function_name)(*function_params) 8500 8501 def calculation_variant_id(self) -> None: 8502 """ 8503 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8504 updates the INFO field of a variants table with the variant ID. 8505 """ 8506 8507 # variant_id annotation field 8508 variant_id_tag = self.get_variant_id_column() 8509 added_columns = [variant_id_tag] 8510 8511 # variant_id hgvs tags" 8512 vcf_infos_tags = { 8513 variant_id_tag: "howard variant ID annotation", 8514 } 8515 8516 # Variants table 8517 table_variants = self.get_table_variants() 8518 8519 # Header 8520 vcf_reader = self.get_header() 8521 8522 # Add variant_id to header 8523 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 8524 variant_id_tag, 8525 ".", 8526 "String", 8527 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 8528 "howard calculation", 8529 "0", 8530 self.code_type_map.get("String"), 8531 ) 8532 8533 # Update 8534 sql_update = f""" 8535 UPDATE {table_variants} 8536 SET "INFO" = 8537 concat( 8538 CASE 8539 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8540 THEN '' 8541 ELSE concat("INFO", ';') 8542 END, 8543 '{variant_id_tag}=', 8544 "{variant_id_tag}" 8545 ) 8546 """ 8547 self.conn.execute(sql_update) 8548 8549 # Remove added columns 8550 for added_column in added_columns: 8551 self.drop_column(column=added_column) 8552 8553 def calculation_extract_snpeff_hgvs( 8554 self, 8555 snpeff_hgvs: str = "snpeff_hgvs", 8556 snpeff_field: str = "ANN", 8557 ) -> None: 8558 """ 8559 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 8560 annotation field in a VCF file and adds them as a new 
column in the variants table. 8561 8562 :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` 8563 function is used to specify the name of the column that will store the HGVS nomenclatures 8564 extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to 8565 snpeff_hgvs 8566 :type snpeff_hgvs: str (optional) 8567 :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` 8568 function represents the field in the VCF file that contains SnpEff annotations. This field is 8569 used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults 8570 to ANN 8571 :type snpeff_field: str (optional) 8572 """ 8573 8574 # Snpeff hgvs tags 8575 vcf_infos_tags = { 8576 snpeff_hgvs: "HGVS nomenclatures from snpEff annotation", 8577 } 8578 8579 # Prefix 8580 prefix = self.get_explode_infos_prefix() 8581 if prefix: 8582 prefix = "INFO/" 8583 8584 # snpEff fields 8585 speff_ann_infos = prefix + snpeff_field 8586 speff_hgvs_infos = prefix + snpeff_hgvs 8587 8588 # Variants table 8589 table_variants = self.get_table_variants() 8590 8591 # Header 8592 vcf_reader = self.get_header() 8593 8594 # Add columns 8595 added_columns = [] 8596 8597 # Explode HGVS field in column 8598 added_columns += self.explode_infos(fields=[snpeff_field]) 8599 8600 if snpeff_field in vcf_reader.infos: 8601 8602 log.debug(vcf_reader.infos[snpeff_field]) 8603 8604 # Extract ANN header 8605 ann_description = vcf_reader.infos[snpeff_field].desc 8606 pattern = r"'(.+?)'" 8607 match = re.search(pattern, ann_description) 8608 if match: 8609 ann_header_match = match.group(1).split(" | ") 8610 ann_header_desc = {} 8611 for i in range(len(ann_header_match)): 8612 ann_header_info = "".join( 8613 char for char in ann_header_match[i] if char.isalnum() 8614 ) 8615 ann_header_desc[ann_header_info] = ann_header_match[i] 8616 if not ann_header_desc: 8617 raise ValueError("Invalid header 
description format") 8618 else: 8619 raise ValueError("Invalid header description format") 8620 8621 # Create variant id 8622 variant_id_column = self.get_variant_id_column() 8623 added_columns += [variant_id_column] 8624 8625 # Create dataframe 8626 dataframe_snpeff_hgvs = self.get_query_to_df( 8627 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8628 ) 8629 8630 # Create main NOMEN column 8631 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8632 speff_ann_infos 8633 ].apply( 8634 lambda x: extract_snpeff_hgvs( 8635 str(x), header=list(ann_header_desc.values()) 8636 ) 8637 ) 8638 8639 # Add snpeff_hgvs to header 8640 vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info( 8641 snpeff_hgvs, 8642 ".", 8643 "String", 8644 vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"), 8645 "howard calculation", 8646 "0", 8647 self.code_type_map.get("String"), 8648 ) 8649 8650 # Update 8651 sql_update = f""" 8652 UPDATE variants 8653 SET "INFO" = 8654 concat( 8655 CASE 8656 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8657 THEN '' 8658 ELSE concat("INFO", ';') 8659 END, 8660 CASE 8661 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8662 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8663 THEN concat( 8664 '{snpeff_hgvs}=', 8665 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8666 ) 8667 ELSE '' 8668 END 8669 ) 8670 FROM dataframe_snpeff_hgvs 8671 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8672 8673 """ 8674 self.conn.execute(sql_update) 8675 8676 # Delete dataframe 8677 del dataframe_snpeff_hgvs 8678 gc.collect() 8679 8680 else: 8681 8682 log.warning( 8683 "No snpEff annotation. 
Please Anotate with snpEff before use this calculation option" 8684 ) 8685 8686 # Remove added columns 8687 for added_column in added_columns: 8688 self.drop_column(column=added_column) 8689 8690 def calculation_snpeff_ann_explode( 8691 self, 8692 uniquify: bool = True, 8693 output_format: str = "fields", 8694 output_prefix: str = "snpeff_", 8695 snpeff_field: str = "ANN", 8696 ) -> None: 8697 """ 8698 The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by 8699 exploding the HGVS field and updating variant information accordingly. 8700 8701 :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a 8702 boolean flag that determines whether the output should be uniquified or not. When set to `True`, 8703 it indicates that the output should be unique, meaning that duplicate entries should be removed, 8704 defaults to True 8705 :type uniquify: bool (optional) 8706 :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode` 8707 function specifies the format in which the output annotations will be generated. It has a 8708 default value of "fields". You can also set it to "JSON" to output the annotations in JSON 8709 format, defaults to fields 8710 :type output_format: str (optional) 8711 :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode` 8712 method is used to specify the prefix that will be added to the output annotations generated 8713 during the calculation process. This prefix helps to differentiate the newly added annotations 8714 from existing ones in the output data. By default, the, defaults to ANN_ 8715 :type output_prefix: str (optional) 8716 :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode` 8717 function is used to specify the field in the VCF file that contains SnpEff annotations. 
This 8718 field will be processed to explode the HGVS annotations and update the variant information 8719 accordingly, defaults to ANN 8720 :type snpeff_field: str (optional) 8721 """ 8722 8723 # SnpEff annotation field 8724 snpeff_hgvs = "snpeff_ann_explode" 8725 8726 # Snpeff hgvs tags 8727 vcf_infos_tags = { 8728 snpeff_hgvs: "Explode snpEff annotations", 8729 } 8730 8731 # Prefix 8732 prefix = self.get_explode_infos_prefix() 8733 if prefix: 8734 prefix = "INFO/" 8735 8736 # snpEff fields 8737 speff_ann_infos = prefix + snpeff_field 8738 speff_hgvs_infos = prefix + snpeff_hgvs 8739 8740 # Variants table 8741 table_variants = self.get_table_variants() 8742 8743 # Header 8744 vcf_reader = self.get_header() 8745 8746 # Add columns 8747 added_columns = [] 8748 8749 # Explode HGVS field in column 8750 added_columns += self.explode_infos(fields=[snpeff_field]) 8751 log.debug(f"snpeff_field={snpeff_field}") 8752 log.debug(f"added_columns={added_columns}") 8753 8754 if snpeff_field in vcf_reader.infos: 8755 8756 # Extract ANN header 8757 ann_description = vcf_reader.infos[snpeff_field].desc 8758 pattern = r"'(.+?)'" 8759 match = re.search(pattern, ann_description) 8760 if match: 8761 ann_header_match = match.group(1).split(" | ") 8762 ann_header = [] 8763 ann_header_desc = {} 8764 for i in range(len(ann_header_match)): 8765 ann_header_info = "".join( 8766 char for char in ann_header_match[i] if char.isalnum() 8767 ) 8768 ann_header.append(ann_header_info) 8769 ann_header_desc[ann_header_info] = ann_header_match[i] 8770 if not ann_header_desc: 8771 raise ValueError("Invalid header description format") 8772 else: 8773 raise ValueError("Invalid header description format") 8774 8775 # Create variant id 8776 variant_id_column = self.get_variant_id_column() 8777 added_columns += [variant_id_column] 8778 8779 # Create dataframe 8780 dataframe_snpeff_hgvs = self.get_query_to_df( 8781 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8782 ) 8783 
8784 # Create snpEff columns 8785 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8786 speff_ann_infos 8787 ].apply( 8788 lambda x: explode_snpeff_ann( 8789 str(x), 8790 uniquify=uniquify, 8791 output_format=output_format, 8792 prefix=output_prefix, 8793 header=list(ann_header_desc.values()), 8794 ) 8795 ) 8796 8797 # Header 8798 ann_annotations_prefix = "" 8799 if output_format.upper() in ["JSON"]: 8800 ann_annotations_prefix = f"{output_prefix}=" 8801 vcf_reader.infos[output_prefix] = vcf.parser._Info( 8802 output_prefix, 8803 ".", 8804 "String", 8805 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8806 + " - JSON format", 8807 "howard calculation", 8808 "0", 8809 self.code_type_map.get("String"), 8810 ) 8811 else: 8812 for ann_annotation in ann_header: 8813 ann_annotation_id = f"{output_prefix}{ann_annotation}" 8814 vcf_reader.infos[ann_annotation_id] = vcf.parser._Info( 8815 ann_annotation_id, 8816 ".", 8817 "String", 8818 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8819 + f" - '{ann_header_desc[ann_annotation]}' annotation", 8820 "howard calculation", 8821 "0", 8822 self.code_type_map.get("String"), 8823 ) 8824 8825 # Update 8826 sql_update = f""" 8827 UPDATE variants 8828 SET "INFO" = 8829 concat( 8830 CASE 8831 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8832 THEN '' 8833 ELSE concat("INFO", ';') 8834 END, 8835 CASE 8836 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8837 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8838 THEN concat( 8839 '{ann_annotations_prefix}', 8840 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8841 ) 8842 ELSE '' 8843 END 8844 ) 8845 FROM dataframe_snpeff_hgvs 8846 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8847 8848 """ 8849 self.conn.execute(sql_update) 8850 8851 # Delete dataframe 8852 del dataframe_snpeff_hgvs 8853 gc.collect() 8854 8855 else: 8856 8857 log.warning( 8858 "No snpEff annotation. 
Please Anotate with snpEff before use this calculation option" 8859 ) 8860 8861 # Remove added columns 8862 for added_column in added_columns: 8863 self.drop_column(column=added_column) 8864 8865 def calculation_extract_nomen(self) -> None: 8866 """ 8867 This function extracts the HGVS nomenclature from the calculation/identification of NOMEN. 8868 """ 8869 8870 # NOMEN field 8871 field_nomen_dict = "NOMEN_DICT" 8872 8873 # NOMEN structure 8874 nomen_dict = { 8875 "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)", 8876 "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)", 8877 "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)", 8878 "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant", 8879 "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)", 8880 "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)", 8881 "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)", 8882 "VNOMEN": "VNOMEN hgvs transcript version used (e.g. 
for CNOMEN and PNOMEN)", 8883 "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)", 8884 "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)", 8885 } 8886 8887 # Param 8888 param = self.get_param() 8889 8890 # Prefix 8891 prefix = self.get_explode_infos_prefix() 8892 8893 # Header 8894 vcf_reader = self.get_header() 8895 8896 # Added columns 8897 added_columns = [] 8898 8899 # Get HGVS field 8900 hgvs_field = ( 8901 param.get("calculation", {}) 8902 .get("calculations", {}) 8903 .get("NOMEN", {}) 8904 .get("options", {}) 8905 .get("hgvs_field", "hgvs") 8906 ) 8907 8908 # Get NOMEN pattern 8909 nomen_pattern = ( 8910 param.get("calculation", {}) 8911 .get("calculations", {}) 8912 .get("NOMEN", {}) 8913 .get("options", {}) 8914 .get("pattern", None) 8915 ) 8916 8917 # transcripts list of preference sources 8918 transcripts_sources = {} 8919 8920 # Get transcripts 8921 transcripts_file = ( 8922 param.get("calculation", {}) 8923 .get("calculations", {}) 8924 .get("NOMEN", {}) 8925 .get("options", {}) 8926 .get("transcripts", None) 8927 ) 8928 transcripts_file = full_path(transcripts_file) 8929 if transcripts_file: 8930 if os.path.exists(transcripts_file): 8931 transcripts_dataframe = transcripts_file_to_df(transcripts_file) 8932 transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist() 8933 transcripts_sources["file"] = transcripts_from_file 8934 else: 8935 msg_err = f"Transcript file '{transcripts_file}' does NOT exist" 8936 log.error(msg_err) 8937 raise ValueError(msg_err) 8938 8939 # Get transcripts table 8940 transcripts_table = ( 8941 param.get("calculation", {}) 8942 .get("calculations", {}) 8943 .get("NOMEN", {}) 8944 .get("options", {}) 8945 .get("transcripts_table", self.get_table_variants()) 8946 ) 8947 # Get transcripts column 8948 transcripts_column = ( 8949 param.get("calculation", {}) 8950 .get("calculations", {}) 8951 .get("NOMEN", {}) 8952 .get("options", {}) 8953 .get("transcripts_column", None) 8954 
) 8955 8956 if transcripts_table and transcripts_column: 8957 extra_field_transcript = f"{transcripts_table}.{transcripts_column}" 8958 # Explode if not exists 8959 self.explode_infos(fields=[transcripts_column], table=transcripts_table) 8960 else: 8961 extra_field_transcript = f"NULL" 8962 8963 # Transcripts of preference source order 8964 transcripts_order = ( 8965 param.get("calculation", {}) 8966 .get("calculations", {}) 8967 .get("NOMEN", {}) 8968 .get("options", {}) 8969 .get("transcripts_order", ["column", "file"]) 8970 ) 8971 8972 # Transcripts from file 8973 transcripts = transcripts_sources.get("file", []) 8974 8975 # Explode HGVS field in column 8976 added_columns += self.explode_infos(fields=[hgvs_field]) 8977 8978 # extra infos 8979 extra_infos = self.get_extra_infos() 8980 extra_field = prefix + hgvs_field 8981 8982 if extra_field in extra_infos: 8983 8984 # Create dataframe 8985 dataframe_hgvs = self.get_query_to_df( 8986 f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """ 8987 ) 8988 8989 # Create main NOMEN column 8990 dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply( 8991 lambda x: find_nomen( 8992 hgvs=x.hgvs, 8993 transcript=x.transcript, 8994 transcripts=transcripts, 8995 pattern=nomen_pattern, 8996 transcripts_source_order=transcripts_order, 8997 ), 8998 axis=1, 8999 ) 9000 9001 # Explode NOMEN Structure and create SQL set for update 9002 sql_nomen_fields = [] 9003 for nomen_field in nomen_dict: 9004 9005 # Explode each field into a column 9006 dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply( 9007 lambda x: dict(x).get(nomen_field, "") 9008 ) 9009 9010 # Create VCF header field 9011 vcf_reader.infos[nomen_field] = vcf.parser._Info( 9012 nomen_field, 9013 ".", 9014 "String", 9015 nomen_dict.get(nomen_field, "howard calculation NOMEN"), 9016 "howard calculation", 9017 "0", 9018 self.code_type_map.get("String"), 9019 ) 9020 
sql_nomen_fields.append( 9021 f""" 9022 CASE 9023 WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('') 9024 THEN concat( 9025 ';{nomen_field}=', 9026 dataframe_hgvs."{nomen_field}" 9027 ) 9028 ELSE '' 9029 END 9030 """ 9031 ) 9032 9033 # SQL set for update 9034 sql_nomen_fields_set = ", ".join(sql_nomen_fields) 9035 9036 # Update 9037 sql_update = f""" 9038 UPDATE variants 9039 SET "INFO" = 9040 concat( 9041 CASE 9042 WHEN "INFO" IS NULL 9043 THEN '' 9044 ELSE "INFO" 9045 END, 9046 {sql_nomen_fields_set} 9047 ) 9048 FROM dataframe_hgvs 9049 WHERE variants."#CHROM" = dataframe_hgvs."#CHROM" 9050 AND variants."POS" = dataframe_hgvs."POS" 9051 AND variants."REF" = dataframe_hgvs."REF" 9052 AND variants."ALT" = dataframe_hgvs."ALT" 9053 """ 9054 self.conn.execute(sql_update) 9055 9056 # Delete dataframe 9057 del dataframe_hgvs 9058 gc.collect() 9059 9060 # Remove added columns 9061 for added_column in added_columns: 9062 self.drop_column(column=added_column) 9063 9064 def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None: 9065 """ 9066 The function `calculation_find_by_pipeline` performs a calculation to find the number of 9067 pipeline/sample for a variant and updates the variant information in a VCF file. 9068 9069 :param tag: The `tag` parameter is a string that represents the annotation field for the 9070 "findbypipeline" information in the VCF file. 
It is used to create the annotation field in the 9071 VCF header and to update the corresponding field in the variants table, defaults to 9072 findbypipeline 9073 :type tag: str (optional) 9074 """ 9075 9076 # if FORMAT and samples 9077 if ( 9078 "FORMAT" in self.get_header_columns_as_list() 9079 and self.get_header_sample_list() 9080 ): 9081 9082 # findbypipeline annotation field 9083 findbypipeline_tag = tag 9084 9085 # VCF infos tags 9086 vcf_infos_tags = { 9087 findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})", 9088 } 9089 9090 # Prefix 9091 prefix = self.get_explode_infos_prefix() 9092 9093 # Field 9094 findbypipeline_infos = prefix + findbypipeline_tag 9095 9096 # Variants table 9097 table_variants = self.get_table_variants() 9098 9099 # Header 9100 vcf_reader = self.get_header() 9101 9102 # Create variant id 9103 variant_id_column = self.get_variant_id_column() 9104 added_columns = [variant_id_column] 9105 9106 # variant_id, FORMAT and samples 9107 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9108 self.get_header_sample_list() 9109 ) 9110 9111 # Create dataframe 9112 dataframe_findbypipeline = self.get_query_to_df( 9113 f""" SELECT {samples_fields} FROM {table_variants} """ 9114 ) 9115 9116 # Create findbypipeline column 9117 dataframe_findbypipeline[findbypipeline_infos] = ( 9118 dataframe_findbypipeline.apply( 9119 lambda row: findbypipeline( 9120 row, samples=self.get_header_sample_list() 9121 ), 9122 axis=1, 9123 ) 9124 ) 9125 9126 # Add snpeff_hgvs to header 9127 vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info( 9128 findbypipeline_tag, 9129 ".", 9130 "String", 9131 vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"), 9132 "howard calculation", 9133 "0", 9134 self.code_type_map.get("String"), 9135 ) 9136 9137 # Update 9138 sql_update = f""" 9139 UPDATE variants 9140 SET "INFO" = 9141 concat( 9142 CASE 9143 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9144 THEN '' 9145 ELSE 
concat("INFO", ';') 9146 END, 9147 CASE 9148 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 9149 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 9150 THEN concat( 9151 '{findbypipeline_tag}=', 9152 dataframe_findbypipeline."{findbypipeline_infos}" 9153 ) 9154 ELSE '' 9155 END 9156 ) 9157 FROM dataframe_findbypipeline 9158 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 9159 """ 9160 self.conn.execute(sql_update) 9161 9162 # Remove added columns 9163 for added_column in added_columns: 9164 self.drop_column(column=added_column) 9165 9166 # Delete dataframe 9167 del dataframe_findbypipeline 9168 gc.collect() 9169 9170 def calculation_genotype_concordance(self) -> None: 9171 """ 9172 The function `calculation_genotype_concordance` calculates the genotype concordance for 9173 multi-caller VCF files and updates the variant information in the database. 9174 """ 9175 9176 # if FORMAT and samples 9177 if ( 9178 "FORMAT" in self.get_header_columns_as_list() 9179 and self.get_header_sample_list() 9180 ): 9181 9182 # genotypeconcordance annotation field 9183 genotypeconcordance_tag = "genotypeconcordance" 9184 9185 # VCF infos tags 9186 vcf_infos_tags = { 9187 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 9188 } 9189 9190 # Prefix 9191 prefix = self.get_explode_infos_prefix() 9192 9193 # Field 9194 genotypeconcordance_infos = prefix + genotypeconcordance_tag 9195 9196 # Variants table 9197 table_variants = self.get_table_variants() 9198 9199 # Header 9200 vcf_reader = self.get_header() 9201 9202 # Create variant id 9203 variant_id_column = self.get_variant_id_column() 9204 added_columns = [variant_id_column] 9205 9206 # variant_id, FORMAT and samples 9207 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9208 self.get_header_sample_list() 9209 ) 9210 9211 # Create dataframe 9212 dataframe_genotypeconcordance = self.get_query_to_df( 9213 f""" SELECT 
{samples_fields} FROM {table_variants} """ 9214 ) 9215 9216 # Create genotypeconcordance column 9217 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 9218 dataframe_genotypeconcordance.apply( 9219 lambda row: genotypeconcordance( 9220 row, samples=self.get_header_sample_list() 9221 ), 9222 axis=1, 9223 ) 9224 ) 9225 9226 # Add genotypeconcordance to header 9227 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 9228 genotypeconcordance_tag, 9229 ".", 9230 "String", 9231 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 9232 "howard calculation", 9233 "0", 9234 self.code_type_map.get("String"), 9235 ) 9236 9237 # Update 9238 sql_update = f""" 9239 UPDATE variants 9240 SET "INFO" = 9241 concat( 9242 CASE 9243 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9244 THEN '' 9245 ELSE concat("INFO", ';') 9246 END, 9247 CASE 9248 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 9249 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 9250 THEN concat( 9251 '{genotypeconcordance_tag}=', 9252 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 9253 ) 9254 ELSE '' 9255 END 9256 ) 9257 FROM dataframe_genotypeconcordance 9258 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 9259 """ 9260 self.conn.execute(sql_update) 9261 9262 # Remove added columns 9263 for added_column in added_columns: 9264 self.drop_column(column=added_column) 9265 9266 # Delete dataframe 9267 del dataframe_genotypeconcordance 9268 gc.collect() 9269 9270 def calculation_barcode(self, tag: str = "barcode") -> None: 9271 """ 9272 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 9273 updates the INFO field in the file with the calculated barcode values. 9274 9275 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 9276 name that will be used for the barcode calculation in the VCF file. 
If no tag name is provided, 9277 the default tag name is set to "barcode", defaults to barcode 9278 :type tag: str (optional) 9279 """ 9280 9281 # if FORMAT and samples 9282 if ( 9283 "FORMAT" in self.get_header_columns_as_list() 9284 and self.get_header_sample_list() 9285 ): 9286 9287 # barcode annotation field 9288 if not tag: 9289 tag = "barcode" 9290 9291 # VCF infos tags 9292 vcf_infos_tags = { 9293 tag: "barcode calculation (VaRank)", 9294 } 9295 9296 # Prefix 9297 prefix = self.get_explode_infos_prefix() 9298 9299 # Field 9300 barcode_infos = prefix + tag 9301 9302 # Variants table 9303 table_variants = self.get_table_variants() 9304 9305 # Header 9306 vcf_reader = self.get_header() 9307 9308 # Create variant id 9309 variant_id_column = self.get_variant_id_column() 9310 added_columns = [variant_id_column] 9311 9312 # variant_id, FORMAT and samples 9313 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9314 self.get_header_sample_list() 9315 ) 9316 9317 # Create dataframe 9318 dataframe_barcode = self.get_query_to_df( 9319 f""" SELECT {samples_fields} FROM {table_variants} """ 9320 ) 9321 9322 # Create barcode column 9323 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9324 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 9325 ) 9326 9327 # Add barcode to header 9328 vcf_reader.infos[tag] = vcf.parser._Info( 9329 tag, 9330 ".", 9331 "String", 9332 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 9333 "howard calculation", 9334 "0", 9335 self.code_type_map.get("String"), 9336 ) 9337 9338 # Update 9339 sql_update = f""" 9340 UPDATE {table_variants} 9341 SET "INFO" = 9342 concat( 9343 CASE 9344 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9345 THEN '' 9346 ELSE concat("INFO", ';') 9347 END, 9348 CASE 9349 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 9350 AND dataframe_barcode."{barcode_infos}" NOT NULL 9351 THEN concat( 9352 '{tag}=', 9353 dataframe_barcode."{barcode_infos}" 9354 ) 9355 ELSE '' 9356 
END 9357 ) 9358 FROM dataframe_barcode 9359 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9360 """ 9361 self.conn.execute(sql_update) 9362 9363 # Remove added columns 9364 for added_column in added_columns: 9365 self.drop_column(column=added_column) 9366 9367 # Delete dataframe 9368 del dataframe_barcode 9369 gc.collect() 9370 9371 def calculation_barcode_family(self, tag: str = "BCF") -> None: 9372 """ 9373 The `calculation_barcode_family` function calculates barcode values for variants in a VCF file 9374 and updates the INFO field in the file with the calculated barcode values. 9375 9376 :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify 9377 the barcode tag that will be added to the VCF file during the calculation process. If no value 9378 is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF 9379 :type tag: str (optional) 9380 """ 9381 9382 # if FORMAT and samples 9383 if ( 9384 "FORMAT" in self.get_header_columns_as_list() 9385 and self.get_header_sample_list() 9386 ): 9387 9388 # barcode annotation field 9389 if not tag: 9390 tag = "BCF" 9391 9392 # VCF infos tags 9393 vcf_infos_tags = { 9394 tag: "barcode family calculation", 9395 f"{tag}S": "barcode family samples", 9396 } 9397 9398 # Param 9399 param = self.get_param() 9400 log.debug(f"param={param}") 9401 9402 # Prefix 9403 prefix = self.get_explode_infos_prefix() 9404 9405 # PED param 9406 ped = ( 9407 param.get("calculation", {}) 9408 .get("calculations", {}) 9409 .get("BARCODEFAMILY", {}) 9410 .get("family_pedigree", None) 9411 ) 9412 log.debug(f"ped={ped}") 9413 9414 # Load PED 9415 if ped: 9416 9417 # Pedigree is a file 9418 if isinstance(ped, str) and os.path.exists(full_path(ped)): 9419 log.debug("Pedigree is file") 9420 with open(full_path(ped)) as ped: 9421 ped = json.load(ped) 9422 9423 # Pedigree is a string 9424 elif isinstance(ped, str): 9425 log.debug("Pedigree is str") 9426 
try: 9427 ped = json.loads(ped) 9428 log.debug("Pedigree is json str") 9429 except ValueError as e: 9430 ped_samples = ped.split(",") 9431 ped = {} 9432 for ped_sample in ped_samples: 9433 ped[ped_sample] = ped_sample 9434 9435 # Pedigree is a dict 9436 elif isinstance(ped, dict): 9437 log.debug("Pedigree is dict") 9438 9439 # Pedigree is not well formatted 9440 else: 9441 msg_error = "Pedigree not well formatted" 9442 log.error(msg_error) 9443 raise ValueError(msg_error) 9444 9445 # Construct list 9446 ped_samples = list(ped.values()) 9447 9448 else: 9449 log.debug("Pedigree not defined. Take all samples") 9450 ped_samples = self.get_header_sample_list() 9451 ped = {} 9452 for ped_sample in ped_samples: 9453 ped[ped_sample] = ped_sample 9454 9455 # Check pedigree 9456 if not ped or len(ped) == 0: 9457 msg_error = f"Error in pedigree: samples {ped_samples}" 9458 log.error(msg_error) 9459 raise ValueError(msg_error) 9460 9461 # Log 9462 log.info( 9463 "Calculation 'BARCODEFAMILY' - Samples: " 9464 + ", ".join([f"{member}='{ped[member]}'" for member in ped]) 9465 ) 9466 log.debug(f"ped_samples={ped_samples}") 9467 9468 # Field 9469 barcode_infos = prefix + tag 9470 9471 # Variants table 9472 table_variants = self.get_table_variants() 9473 9474 # Header 9475 vcf_reader = self.get_header() 9476 9477 # Create variant id 9478 variant_id_column = self.get_variant_id_column() 9479 added_columns = [variant_id_column] 9480 9481 # variant_id, FORMAT and samples 9482 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9483 ped_samples 9484 ) 9485 9486 # Create dataframe 9487 dataframe_barcode = self.get_query_to_df( 9488 f""" SELECT {samples_fields} FROM {table_variants} """ 9489 ) 9490 9491 # Create barcode column 9492 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9493 lambda row: barcode(row, samples=ped_samples), axis=1 9494 ) 9495 9496 # Add barcode family to header 9497 # Add vaf_normalization to header 9498 vcf_reader.formats[tag] = 
vcf.parser._Format( 9499 id=tag, 9500 num=".", 9501 type="String", 9502 desc=vcf_infos_tags.get(tag, "barcode family calculation"), 9503 type_code=self.code_type_map.get("String"), 9504 ) 9505 vcf_reader.formats[f"{tag}S"] = vcf.parser._Format( 9506 id=f"{tag}S", 9507 num=".", 9508 type="String", 9509 desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"), 9510 type_code=self.code_type_map.get("String"), 9511 ) 9512 9513 # Update 9514 # for sample in ped_samples: 9515 sql_update_set = [] 9516 for sample in self.get_header_sample_list() + ["FORMAT"]: 9517 if sample in ped_samples: 9518 value = f'dataframe_barcode."{barcode_infos}"' 9519 value_samples = "'" + ",".join(ped_samples) + "'" 9520 elif sample == "FORMAT": 9521 value = f"'{tag}'" 9522 value_samples = f"'{tag}S'" 9523 else: 9524 value = "'.'" 9525 value_samples = "'.'" 9526 format_regex = r"[a-zA-Z0-9\s]" 9527 sql_update_set.append( 9528 f""" 9529 "{sample}" = 9530 concat( 9531 CASE 9532 WHEN {table_variants}."{sample}" = './.' 9533 THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g')) 9534 ELSE {table_variants}."{sample}" 9535 END, 9536 ':', 9537 {value}, 9538 ':', 9539 {value_samples} 9540 ) 9541 """ 9542 ) 9543 9544 sql_update_set_join = ", ".join(sql_update_set) 9545 sql_update = f""" 9546 UPDATE {table_variants} 9547 SET {sql_update_set_join} 9548 FROM dataframe_barcode 9549 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9550 """ 9551 self.conn.execute(sql_update) 9552 9553 # Remove added columns 9554 for added_column in added_columns: 9555 self.drop_column(column=added_column) 9556 9557 # Delete dataframe 9558 del dataframe_barcode 9559 gc.collect() 9560 9561 def calculation_trio(self) -> None: 9562 """ 9563 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 9564 information to the INFO field of each variant. 
9565 """ 9566 9567 # if FORMAT and samples 9568 if ( 9569 "FORMAT" in self.get_header_columns_as_list() 9570 and self.get_header_sample_list() 9571 ): 9572 9573 # trio annotation field 9574 trio_tag = "trio" 9575 9576 # VCF infos tags 9577 vcf_infos_tags = { 9578 "trio": "trio calculation", 9579 } 9580 9581 # Param 9582 param = self.get_param() 9583 9584 # Prefix 9585 prefix = self.get_explode_infos_prefix() 9586 9587 # Trio param 9588 trio_ped = ( 9589 param.get("calculation", {}) 9590 .get("calculations", {}) 9591 .get("TRIO", {}) 9592 .get("trio_pedigree", None) 9593 ) 9594 9595 # Load trio 9596 if trio_ped: 9597 9598 # Trio pedigree is a file 9599 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 9600 log.debug("TRIO pedigree is file") 9601 with open(full_path(trio_ped)) as trio_ped: 9602 trio_ped = json.load(trio_ped) 9603 9604 # Trio pedigree is a string 9605 elif isinstance(trio_ped, str): 9606 log.debug("TRIO pedigree is str") 9607 try: 9608 trio_ped = json.loads(trio_ped) 9609 log.debug("TRIO pedigree is json str") 9610 except ValueError as e: 9611 trio_samples = trio_ped.split(",") 9612 if len(trio_samples) == 3: 9613 trio_ped = { 9614 "father": trio_samples[0], 9615 "mother": trio_samples[1], 9616 "child": trio_samples[2], 9617 } 9618 log.debug("TRIO pedigree is list str") 9619 else: 9620 msg_error = "TRIO pedigree not well formatted" 9621 log.error(msg_error) 9622 raise ValueError(msg_error) 9623 9624 # Trio pedigree is a dict 9625 elif isinstance(trio_ped, dict): 9626 log.debug("TRIO pedigree is dict") 9627 9628 # Trio pedigree is not well formatted 9629 else: 9630 msg_error = "TRIO pedigree not well formatted" 9631 log.error(msg_error) 9632 raise ValueError(msg_error) 9633 9634 # Construct trio list 9635 trio_samples = [ 9636 trio_ped.get("father", ""), 9637 trio_ped.get("mother", ""), 9638 trio_ped.get("child", ""), 9639 ] 9640 9641 else: 9642 log.debug("TRIO pedigree not defined. 
Take the first 3 samples") 9643 samples_list = self.get_header_sample_list() 9644 if len(samples_list) >= 3: 9645 trio_samples = self.get_header_sample_list()[0:3] 9646 trio_ped = { 9647 "father": trio_samples[0], 9648 "mother": trio_samples[1], 9649 "child": trio_samples[2], 9650 } 9651 else: 9652 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9653 log.error(msg_error) 9654 raise ValueError(msg_error) 9655 9656 # Check trio pedigree 9657 if not trio_ped or len(trio_ped) != 3: 9658 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9659 log.error(msg_error) 9660 raise ValueError(msg_error) 9661 9662 # Log 9663 log.info( 9664 f"Calculation 'TRIO' - Samples: " 9665 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9666 ) 9667 9668 # Field 9669 trio_infos = prefix + trio_tag 9670 9671 # Variants table 9672 table_variants = self.get_table_variants() 9673 9674 # Header 9675 vcf_reader = self.get_header() 9676 9677 # Create variant id 9678 variant_id_column = self.get_variant_id_column() 9679 added_columns = [variant_id_column] 9680 9681 # variant_id, FORMAT and samples 9682 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9683 self.get_header_sample_list() 9684 ) 9685 9686 # Create dataframe 9687 dataframe_trio = self.get_query_to_df( 9688 f""" SELECT {samples_fields} FROM {table_variants} """ 9689 ) 9690 9691 # Create trio column 9692 dataframe_trio[trio_infos] = dataframe_trio.apply( 9693 lambda row: trio(row, samples=trio_samples), axis=1 9694 ) 9695 9696 # Add trio to header 9697 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9698 trio_tag, 9699 ".", 9700 "String", 9701 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9702 "howard calculation", 9703 "0", 9704 self.code_type_map.get("String"), 9705 ) 9706 9707 # Update 9708 sql_update = f""" 9709 UPDATE {table_variants} 9710 SET "INFO" = 9711 concat( 9712 CASE 9713 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9714 THEN '' 9715 ELSE 
concat("INFO", ';') 9716 END, 9717 CASE 9718 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9719 AND dataframe_trio."{trio_infos}" NOT NULL 9720 THEN concat( 9721 '{trio_tag}=', 9722 dataframe_trio."{trio_infos}" 9723 ) 9724 ELSE '' 9725 END 9726 ) 9727 FROM dataframe_trio 9728 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9729 """ 9730 self.conn.execute(sql_update) 9731 9732 # Remove added columns 9733 for added_column in added_columns: 9734 self.drop_column(column=added_column) 9735 9736 # Delete dataframe 9737 del dataframe_trio 9738 gc.collect() 9739 9740 def calculation_vaf_normalization(self) -> None: 9741 """ 9742 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9743 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9744 :return: The function does not return anything. 9745 """ 9746 9747 # if FORMAT and samples 9748 if ( 9749 "FORMAT" in self.get_header_columns_as_list() 9750 and self.get_header_sample_list() 9751 ): 9752 9753 # vaf_normalization annotation field 9754 vaf_normalization_tag = "VAF" 9755 9756 # VCF infos tags 9757 vcf_infos_tags = { 9758 "VAF": "VAF Variant Frequency", 9759 } 9760 9761 # Prefix 9762 prefix = self.get_explode_infos_prefix() 9763 9764 # Variants table 9765 table_variants = self.get_table_variants() 9766 9767 # Header 9768 vcf_reader = self.get_header() 9769 9770 # Do not calculate if VAF already exists 9771 if "VAF" in vcf_reader.formats: 9772 log.debug("VAF already on genotypes") 9773 return 9774 9775 # Create variant id 9776 variant_id_column = self.get_variant_id_column() 9777 added_columns = [variant_id_column] 9778 9779 # variant_id, FORMAT and samples 9780 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9781 f""" "{sample}" """ for sample in self.get_header_sample_list() 9782 ) 9783 9784 # Create dataframe 9785 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} 
FROM {table_variants} """ 9786 log.debug(f"query={query}") 9787 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9788 9789 vaf_normalization_set = [] 9790 9791 # for each sample vaf_normalization 9792 for sample in self.get_header_sample_list(): 9793 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9794 lambda row: vaf_normalization(row, sample=sample), axis=1 9795 ) 9796 vaf_normalization_set.append( 9797 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9798 ) 9799 9800 # Add VAF to FORMAT 9801 dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[ 9802 "FORMAT" 9803 ].apply(lambda x: str(x) + ":VAF") 9804 vaf_normalization_set.append( 9805 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9806 ) 9807 9808 # Add vaf_normalization to header 9809 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9810 id=vaf_normalization_tag, 9811 num="1", 9812 type="Float", 9813 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9814 type_code=self.code_type_map.get("Float"), 9815 ) 9816 9817 # Create fields to add in INFO 9818 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9819 9820 # Update 9821 sql_update = f""" 9822 UPDATE {table_variants} 9823 SET {sql_vaf_normalization_set} 9824 FROM dataframe_vaf_normalization 9825 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9826 9827 """ 9828 self.conn.execute(sql_update) 9829 9830 # Remove added columns 9831 for added_column in added_columns: 9832 self.drop_column(column=added_column) 9833 9834 # Delete dataframe 9835 del dataframe_vaf_normalization 9836 gc.collect() 9837 9838 def calculation_genotype_stats(self, info: str = "VAF") -> None: 9839 """ 9840 The `calculation_genotype_stats` function calculates genotype statistics for a given information 9841 field in a VCF file and updates the INFO column of the variants table with the calculated 9842 statistics. 
9843 9844 :param info: The `info` parameter is a string that represents the type of information for which 9845 genotype statistics are calculated. It is used to generate various VCF info tags for the 9846 statistics, such as the number of occurrences, the list of values, the minimum value, the 9847 maximum value, the mean, the median, defaults to VAF 9848 :type info: str (optional) 9849 """ 9850 9851 # if FORMAT and samples 9852 if ( 9853 "FORMAT" in self.get_header_columns_as_list() 9854 and self.get_header_sample_list() 9855 ): 9856 9857 # vaf_stats annotation field 9858 vaf_stats_tag = info + "_stats" 9859 9860 # VCF infos tags 9861 vcf_infos_tags = { 9862 info + "_stats_nb": f"genotype {info} Statistics - number of {info}", 9863 info + "_stats_list": f"genotype {info} Statistics - list of {info}", 9864 info + "_stats_min": f"genotype {info} Statistics - min {info}", 9865 info + "_stats_max": f"genotype {info} Statistics - max {info}", 9866 info + "_stats_mean": f"genotype {info} Statistics - mean {info}", 9867 info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}", 9868 info 9869 + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}", 9870 } 9871 9872 # Prefix 9873 prefix = self.get_explode_infos_prefix() 9874 9875 # Field 9876 vaf_stats_infos = prefix + vaf_stats_tag 9877 9878 # Variants table 9879 table_variants = self.get_table_variants() 9880 9881 # Header 9882 vcf_reader = self.get_header() 9883 9884 # Create variant id 9885 variant_id_column = self.get_variant_id_column() 9886 added_columns = [variant_id_column] 9887 9888 # variant_id, FORMAT and samples 9889 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9890 self.get_header_sample_list() 9891 ) 9892 9893 # Create dataframe 9894 dataframe_vaf_stats = self.get_query_to_df( 9895 f""" SELECT {samples_fields} FROM {table_variants} """ 9896 ) 9897 9898 # Create vaf_stats column 9899 dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply( 9900 
lambda row: genotype_stats( 9901 row, samples=self.get_header_sample_list(), info=info 9902 ), 9903 axis=1, 9904 ) 9905 9906 # List of vcf tags 9907 sql_vaf_stats_fields = [] 9908 9909 # Check all VAF stats infos 9910 for stat in vcf_infos_tags: 9911 9912 # Extract stats 9913 dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply( 9914 lambda x: dict(x).get(stat, "") 9915 ) 9916 9917 # Add snpeff_hgvs to header 9918 vcf_reader.infos[stat] = vcf.parser._Info( 9919 stat, 9920 ".", 9921 "String", 9922 vcf_infos_tags.get(stat, "genotype statistics"), 9923 "howard calculation", 9924 "0", 9925 self.code_type_map.get("String"), 9926 ) 9927 9928 if len(sql_vaf_stats_fields): 9929 sep = ";" 9930 else: 9931 sep = "" 9932 9933 # Create fields to add in INFO 9934 sql_vaf_stats_fields.append( 9935 f""" 9936 CASE 9937 WHEN dataframe_vaf_stats."{stat}" NOT NULL 9938 THEN concat( 9939 '{sep}{stat}=', 9940 dataframe_vaf_stats."{stat}" 9941 ) 9942 ELSE '' 9943 END 9944 """ 9945 ) 9946 9947 # SQL set for update 9948 sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields) 9949 9950 # Update 9951 sql_update = f""" 9952 UPDATE {table_variants} 9953 SET "INFO" = 9954 concat( 9955 CASE 9956 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9957 THEN '' 9958 ELSE concat("INFO", ';') 9959 END, 9960 {sql_vaf_stats_fields_set} 9961 ) 9962 FROM dataframe_vaf_stats 9963 WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}" 9964 9965 """ 9966 self.conn.execute(sql_update) 9967 9968 # Remove added columns 9969 for added_column in added_columns: 9970 self.drop_column(column=added_column) 9971 9972 # Delete dataframe 9973 del dataframe_vaf_stats 9974 gc.collect() 9975 9976 def calculation_transcripts_annotation( 9977 self, info_json: str = None, info_format: str = None 9978 ) -> None: 9979 """ 9980 The `calculation_transcripts_annotation` function creates a transcripts table and adds an info 9981 field to it if transcripts are available. 
9982 9983 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 9984 is a string parameter that represents the information field to be used in the transcripts JSON. 9985 It is used to specify the JSON format for the transcripts information. If no value is provided 9986 when calling the method, it defaults to " 9987 :type info_json: str 9988 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 9989 method is a string parameter that specifies the format of the information field to be used in 9990 the transcripts JSON. It is used to define the format of the information field 9991 :type info_format: str 9992 """ 9993 9994 # Create transcripts table 9995 transcripts_table = self.create_transcript_view() 9996 9997 # Add info field 9998 if transcripts_table: 9999 self.transcript_view_to_variants( 10000 transcripts_table=transcripts_table, 10001 transcripts_info_field_json=info_json, 10002 transcripts_info_field_format=info_format, 10003 ) 10004 else: 10005 log.info("No Transcripts to process. Check param.json file configuration") 10006 10007 def calculation_transcripts_prioritization(self) -> None: 10008 """ 10009 The function `calculation_transcripts_prioritization` creates a transcripts table and 10010 prioritizes transcripts based on certain criteria. 10011 """ 10012 10013 # Create transcripts table 10014 transcripts_table = self.create_transcript_view() 10015 10016 # Add info field 10017 if transcripts_table: 10018 self.transcripts_prioritization(transcripts_table=transcripts_table) 10019 else: 10020 log.info("No Transcripts to process. 
    def calculation_transcripts_export(self) -> None:
        """
        The `calculation_transcripts_export` function creates the transcripts view and exports
        it to the output file configured in the parameters (see `transcripts_export`).

        If no transcripts view can be created, the export is skipped and an informative
        message is logged.
        """

        # Create transcripts table
        transcripts_table = self.create_transcript_view()

        # Export the transcripts table if available
        if transcripts_table:
            self.transcripts_export(transcripts_table=transcripts_table)
        else:
            log.info("No Transcripts to process. Check param.json file configuration")
Output file format 10081 transcripts_export_output_format = get_file_format( 10082 filename=transcripts_export_output 10083 ) 10084 10085 # Format VCF - construct INFO 10086 if transcripts_export_output_format in ["vcf"]: 10087 10088 # Construct query update INFO and header 10089 query_update_info = [] 10090 for field in transcripts_annotations_list: 10091 10092 # If field not in header 10093 if field not in self.get_header_infos_list(): 10094 10095 # Add PZ Transcript in header 10096 self.get_header().infos[field] = vcf.parser._Info( 10097 field, 10098 ".", 10099 "String", 10100 f"Annotation '{field}' from transcript view", 10101 "unknown", 10102 "unknown", 10103 0, 10104 ) 10105 10106 # Add field as INFO/tag 10107 query_update_info.append( 10108 f""" 10109 CASE 10110 WHEN "{field}" IS NOT NULL 10111 THEN concat('{field}=', "{field}", ';') 10112 ELSE '' 10113 END 10114 """ 10115 ) 10116 10117 # Query param 10118 query_update_info_value = ( 10119 f""" concat('', {", ".join(query_update_info)}) """ 10120 ) 10121 query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' 
AS 'FILTER', "INFO" """ 10122 10123 else: 10124 10125 # Query param 10126 query_update_info_value = f""" NULL """ 10127 query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """ 10128 10129 # Update query INFO column 10130 query_update = f""" 10131 UPDATE {transcripts_table_export} 10132 SET INFO = {query_update_info_value} 10133 10134 """ 10135 self.execute_query(query=query_update) 10136 10137 # Export 10138 self.export_output( 10139 output_file=transcripts_export_output, 10140 query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """, 10141 ) 10142 10143 # Drop transcripts export table 10144 query_drop_transcripts_table_export = f""" 10145 DROP TABLE {transcripts_table_export} 10146 """ 10147 self.execute_query(query=query_drop_transcripts_table_export) 10148 10149 def transcripts_prioritization( 10150 self, transcripts_table: str = None, param: dict = {} 10151 ) -> bool: 10152 """ 10153 The `transcripts_prioritization` function prioritizes transcripts based on certain parameters 10154 and updates the variants table with the prioritized information. 10155 10156 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10157 of the table containing transcripts data. If no value is provided, it defaults to "transcripts". 10158 This parameter is used to identify the table where the transcripts data is stored for the 10159 prioritization process 10160 :type transcripts_table: str 10161 :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary 10162 that contains various configuration settings for the prioritization process of transcripts. 
It 10163 is used to customize the behavior of the prioritization algorithm and includes settings such as 10164 the prefix for prioritization fields, default profiles, and other 10165 :type param: dict 10166 :return: The function `transcripts_prioritization` returns a boolean value `True` if the 10167 transcripts prioritization process is successfully completed, and `False` if there are any 10168 issues or if no profile is defined for transcripts prioritization. 10169 """ 10170 10171 log.debug("Start transcripts prioritization...") 10172 10173 # Param 10174 if not param: 10175 param = self.get_param() 10176 10177 # Variants table 10178 table_variants = self.get_table_variants() 10179 10180 # Transcripts table 10181 if transcripts_table is None: 10182 transcripts_table = self.create_transcript_view( 10183 transcripts_table="transcripts", param=param 10184 ) 10185 if transcripts_table is None: 10186 msg_err = "No Transcripts table availalble" 10187 log.error(msg_err) 10188 raise ValueError(msg_err) 10189 log.debug(f"transcripts_table={transcripts_table}") 10190 10191 # Get transcripts columns 10192 columns_as_list_query = f""" 10193 DESCRIBE {transcripts_table} 10194 """ 10195 columns_as_list = list( 10196 self.get_query_to_df(columns_as_list_query)["column_name"] 10197 ) 10198 10199 # Create INFO if not exists 10200 if "INFO" not in columns_as_list: 10201 query_add_info = f""" 10202 ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT ''; 10203 """ 10204 self.execute_query(query_add_info) 10205 10206 # Prioritization param and Force only PZ Score and Flag 10207 pz_param = param.get("transcripts", {}).get("prioritization", {}) 10208 10209 # PZ profile by default 10210 pz_profile_default = ( 10211 param.get("transcripts", {}).get("prioritization", {}).get("profiles", None) 10212 ) 10213 10214 # Exit if no profile 10215 if pz_profile_default is None: 10216 log.warning("No profile defined for transcripts prioritization") 10217 return False 10218 10219 # PZ 
fields 10220 pz_param_pzfields = {} 10221 10222 # PZ field transcripts 10223 pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript" 10224 10225 # Add PZ Transcript in header 10226 self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info( 10227 pz_fields_transcripts, 10228 ".", 10229 "String", 10230 f"Transcript selected from prioritization process, profile {pz_profile_default}", 10231 "unknown", 10232 "unknown", 10233 code_type_map["String"], 10234 ) 10235 10236 # Mandatory fields 10237 pz_mandatory_fields_list = [ 10238 "Score", 10239 "Flag", 10240 "Tags", 10241 "Comment", 10242 "Infos", 10243 "Class", 10244 ] 10245 pz_mandatory_fields = [] 10246 for pz_mandatory_field in pz_mandatory_fields_list: 10247 pz_mandatory_fields.append( 10248 pz_param.get("pzprefix", "PTZ") + pz_mandatory_field 10249 ) 10250 10251 # PZ fields in param 10252 for pz_field in pz_param.get("pzfields", []): 10253 if pz_field in pz_mandatory_fields_list: 10254 pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = ( 10255 pz_param.get("pzprefix", "PTZ") + pz_field 10256 ) 10257 else: 10258 pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field 10259 pz_param_pzfields[pz_field] = pz_field_new 10260 10261 # Add PZ Transcript in header 10262 self.get_header().infos[pz_field_new] = vcf.parser._Info( 10263 pz_field_new, 10264 ".", 10265 "String", 10266 f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}", 10267 "unknown", 10268 "unknown", 10269 code_type_map["String"], 10270 ) 10271 10272 # PZ fields param 10273 pz_param["pzfields"] = pz_mandatory_fields 10274 10275 # Prioritization 10276 prioritization_result = self.prioritization( 10277 table=transcripts_table, 10278 pz_param=param.get("transcripts", {}).get("prioritization", {}), 10279 ) 10280 if not prioritization_result: 10281 log.warning("Transcripts prioritization not processed") 10282 return False 10283 10284 # PZ fields sql query 10285 
query_update_select_list = [] 10286 query_update_concat_list = [] 10287 query_update_order_list = [] 10288 for pz_param_pzfield in set( 10289 list(pz_param_pzfields.keys()) + pz_mandatory_fields 10290 ): 10291 query_update_select_list.append(f" {pz_param_pzfield}, ") 10292 10293 for pz_param_pzfield in pz_param_pzfields: 10294 query_update_concat_list.append( 10295 f""" 10296 , CASE 10297 WHEN {pz_param_pzfield} IS NOT NULL 10298 THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield}) 10299 ELSE '' 10300 END 10301 """ 10302 ) 10303 10304 # Order by 10305 pz_orders = ( 10306 param.get("transcripts", {}) 10307 .get("prioritization", {}) 10308 .get("prioritization_transcripts_order", {}) 10309 ) 10310 if not pz_orders: 10311 pz_orders = { 10312 pz_param.get("pzprefix", "PTZ") + "Flag": "ASC", 10313 pz_param.get("pzprefix", "PTZ") + "Score": "DESC", 10314 } 10315 for pz_order in pz_orders: 10316 query_update_order_list.append( 10317 f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """ 10318 ) 10319 10320 # Fields to explode 10321 fields_to_explode = ( 10322 list(pz_param_pzfields.keys()) 10323 + pz_mandatory_fields 10324 + list(pz_orders.keys()) 10325 ) 10326 # Remove transcript column as a specific transcript column 10327 if "transcript" in fields_to_explode: 10328 fields_to_explode.remove("transcript") 10329 10330 # Fields intranscripts table 10331 query_transcripts_table = f""" 10332 DESCRIBE SELECT * FROM {transcripts_table} 10333 """ 10334 query_transcripts_table = self.get_query_to_df(query=query_transcripts_table) 10335 10336 # Check fields to explode 10337 for field_to_explode in fields_to_explode: 10338 if field_to_explode not in self.get_header_infos_list() + list( 10339 query_transcripts_table.column_name 10340 ): 10341 msg_err = f"INFO/{field_to_explode} NOT IN header" 10342 log.error(msg_err) 10343 raise ValueError(msg_err) 10344 10345 # Explode fields to explode 10346 self.explode_infos( 10347 table=transcripts_table, 10348 
fields=fields_to_explode, 10349 ) 10350 10351 # Transcript preference file 10352 transcripts_preference_file = ( 10353 param.get("transcripts", {}) 10354 .get("prioritization", {}) 10355 .get("prioritization_transcripts", {}) 10356 ) 10357 transcripts_preference_file = full_path(transcripts_preference_file) 10358 10359 # Transcript preference forced 10360 transcript_preference_force = ( 10361 param.get("transcripts", {}) 10362 .get("prioritization", {}) 10363 .get("prioritization_transcripts_force", False) 10364 ) 10365 # Transcript version forced 10366 transcript_version_force = ( 10367 param.get("transcripts", {}) 10368 .get("prioritization", {}) 10369 .get("prioritization_transcripts_version_force", False) 10370 ) 10371 10372 # Transcripts Ranking 10373 if transcripts_preference_file: 10374 10375 # Transcripts file to dataframe 10376 if os.path.exists(transcripts_preference_file): 10377 transcripts_preference_dataframe = transcripts_file_to_df( 10378 transcripts_preference_file 10379 ) 10380 else: 10381 log.error( 10382 f"Transcript file '{transcripts_preference_file}' does NOT exist" 10383 ) 10384 raise ValueError( 10385 f"Transcript file '{transcripts_preference_file}' does NOT exist" 10386 ) 10387 10388 # Order by depending to transcript preference forcing 10389 if transcript_preference_force: 10390 order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """ 10391 else: 10392 order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """ 10393 10394 # Transcript columns joined depend on version consideration 10395 if transcript_version_force: 10396 transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """ 10397 else: 10398 transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """ 10399 10400 # Query 
ranking for update 10401 query_update_ranking = f""" 10402 SELECT 10403 "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)} 10404 ROW_NUMBER() OVER ( 10405 PARTITION BY "#CHROM", POS, REF, ALT 10406 ORDER BY {order_by} 10407 ) AS rn 10408 FROM {transcripts_table} 10409 LEFT JOIN 10410 ( 10411 SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order 10412 FROM transcripts_preference_dataframe 10413 ) AS transcripts_preference 10414 ON {transcripts_version_join} 10415 """ 10416 10417 else: 10418 10419 # Query ranking for update 10420 query_update_ranking = f""" 10421 SELECT 10422 "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)} 10423 ROW_NUMBER() OVER ( 10424 PARTITION BY "#CHROM", POS, REF, ALT 10425 ORDER BY {" , ".join(query_update_order_list)} 10426 ) AS rn 10427 FROM {transcripts_table} 10428 """ 10429 10430 # Export Transcripts prioritization infos to variants table 10431 query_update = f""" 10432 WITH RankedTranscripts AS ( 10433 {query_update_ranking} 10434 ) 10435 UPDATE {table_variants} 10436 SET 10437 INFO = CONCAT(CASE 10438 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 10439 THEN '' 10440 ELSE concat("INFO", ';') 10441 END, 10442 concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)}) 10443 ) 10444 FROM 10445 RankedTranscripts 10446 WHERE 10447 rn = 1 10448 AND variants."#CHROM" = RankedTranscripts."#CHROM" 10449 AND variants."POS" = RankedTranscripts."POS" 10450 AND variants."REF" = RankedTranscripts."REF" 10451 AND variants."ALT" = RankedTranscripts."ALT" 10452 """ 10453 10454 # log.debug(f"query_update={query_update}") 10455 self.execute_query(query=query_update) 10456 10457 # Return 10458 return True 10459 10460 def create_transcript_view_from_columns_map( 10461 self, 10462 transcripts_table: str = "transcripts", 10463 columns_maps: dict = {}, 10464 added_columns: list = [], 10465 temporary_tables: list = None, 10466 
annotation_fields: list = None, 10467 column_rename: dict = {}, 10468 column_clean: bool = False, 10469 column_case: str = None, 10470 ) -> tuple[list, list, list]: 10471 """ 10472 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 10473 specified columns mapping for transcripts data. 10474 10475 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10476 of the table where the transcripts data is stored or will be stored in the database. This table 10477 typically contains information about transcripts such as Ensembl transcript IDs, gene names, 10478 scores, predictions, etc. It defaults to "transcripts, defaults to transcripts 10479 :type transcripts_table: str (optional) 10480 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information 10481 about how to map columns from a transcripts table to create a view. Each entry in the 10482 `columns_maps` list represents a mapping configuration for a specific set of columns. It 10483 typically includes details such as the main transcript column and additional information columns 10484 :type columns_maps: dict 10485 :param added_columns: The `added_columns` parameter in the 10486 `create_transcript_view_from_columns_map` function is a list that stores the additional columns 10487 that will be added to the view being created based on the columns map provided. These columns 10488 are generated by exploding the transcript information columns along with the main transcript 10489 column 10490 :type added_columns: list 10491 :param temporary_tables: The `temporary_tables` parameter in the 10492 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 10493 tables created during the process of creating a transcript view from a columns map. 
These 10494 temporary tables are used to store intermediate results or transformations before the final view 10495 is generated 10496 :type temporary_tables: list 10497 :param annotation_fields: The `annotation_fields` parameter in the 10498 `create_transcript_view_from_columns_map` function is a list that stores the fields that are 10499 used for annotation in the query view creation process. These fields are extracted from the 10500 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 10501 :type annotation_fields: list 10502 :param column_rename: The `column_rename` parameter in the 10503 `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify 10504 custom renaming for columns during the creation of the temporary table view. This parameter 10505 provides a mapping of original column names to the desired renamed column names. By using this 10506 parameter, 10507 :type column_rename: dict 10508 :param column_clean: The `column_clean` parameter in the 10509 `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the 10510 column values should be cleaned or not. If set to `True`, the column values will be cleaned by 10511 removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to 10512 False 10513 :type column_clean: bool (optional) 10514 :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map` 10515 function is used to specify the case transformation to be applied to the columns during the view 10516 creation process. It allows you to control whether the column values should be converted to 10517 lowercase, uppercase, or remain unchanged 10518 :type column_case: str 10519 :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three 10520 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 
10521 """ 10522 10523 log.debug("Start transcrpts view creation from columns map...") 10524 10525 # "from_columns_map": [ 10526 # { 10527 # "transcripts_column": "Ensembl_transcriptid", 10528 # "transcripts_infos_columns": [ 10529 # "genename", 10530 # "Ensembl_geneid", 10531 # "LIST_S2_score", 10532 # "LIST_S2_pred", 10533 # ], 10534 # }, 10535 # { 10536 # "transcripts_column": "Ensembl_transcriptid", 10537 # "transcripts_infos_columns": [ 10538 # "genename", 10539 # "VARITY_R_score", 10540 # "Aloft_pred", 10541 # ], 10542 # }, 10543 # ], 10544 10545 # Init 10546 if temporary_tables is None: 10547 temporary_tables = [] 10548 if annotation_fields is None: 10549 annotation_fields = [] 10550 10551 # Variants table 10552 table_variants = self.get_table_variants() 10553 10554 for columns_map in columns_maps: 10555 10556 # Transcript column 10557 transcripts_column = columns_map.get("transcripts_column", None) 10558 10559 # Transcripts infos columns 10560 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 10561 10562 # Transcripts infos columns rename 10563 column_rename = columns_map.get("column_rename", column_rename) 10564 10565 # Transcripts infos columns clean 10566 column_clean = columns_map.get("column_clean", column_clean) 10567 10568 # Transcripts infos columns case 10569 column_case = columns_map.get("column_case", column_case) 10570 10571 if transcripts_column is not None: 10572 10573 # Explode 10574 added_columns += self.explode_infos( 10575 fields=[transcripts_column] + transcripts_infos_columns 10576 ) 10577 10578 # View clauses 10579 clause_select_variants = [] 10580 clause_select_tanscripts = [] 10581 for field in [transcripts_column] + transcripts_infos_columns: 10582 10583 # AS field 10584 as_field = field 10585 10586 # Rename 10587 if column_rename: 10588 as_field = column_rename.get(as_field, as_field) 10589 10590 # Clean 10591 if column_clean: 10592 as_field = clean_annotation_field(as_field) 10593 10594 # Case 10595 if 
column_case: 10596 if column_case.lower() in ["lower"]: 10597 as_field = as_field.lower() 10598 elif column_case.lower() in ["upper"]: 10599 as_field = as_field.upper() 10600 10601 # Clause select Variants 10602 clause_select_variants.append( 10603 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10604 ) 10605 10606 if field in [transcripts_column]: 10607 clause_select_tanscripts.append( 10608 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10609 ) 10610 else: 10611 clause_select_tanscripts.append( 10612 f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """ 10613 ) 10614 annotation_fields.append(as_field) 10615 10616 # Querey View 10617 query = f""" 10618 SELECT 10619 "#CHROM", POS, REF, ALT, INFO, 10620 "{transcripts_column}" AS 'transcript', 10621 {", ".join(clause_select_tanscripts)} 10622 FROM ( 10623 SELECT 10624 "#CHROM", POS, REF, ALT, INFO, 10625 {", ".join(clause_select_variants)} 10626 FROM {table_variants} 10627 ) 10628 WHERE "{transcripts_column}" IS NOT NULL 10629 """ 10630 10631 # Create temporary table 10632 temporary_table = transcripts_table + "".join( 10633 random.choices(string.ascii_uppercase + string.digits, k=10) 10634 ) 10635 10636 # Temporary_tables 10637 temporary_tables.append(temporary_table) 10638 query_view = f""" 10639 CREATE TEMPORARY TABLE {temporary_table} 10640 AS ({query}) 10641 """ 10642 self.execute_query(query=query_view) 10643 10644 return added_columns, temporary_tables, annotation_fields 10645 10646 def create_transcript_view_from_column_format( 10647 self, 10648 transcripts_table: str = "transcripts", 10649 column_formats: dict = {}, 10650 temporary_tables: list = None, 10651 annotation_fields: list = None, 10652 column_rename: dict = {}, 10653 column_clean: bool = False, 10654 column_case: str = None, 10655 ) -> tuple[list, list, list]: 10656 """ 10657 The `create_transcript_view_from_column_format` function generates a transcript view based on 10658 specified column formats, adds additional 
columns and annotation fields, and returns the list of
        temporary tables and annotation fields.

        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
        of the table containing the transcripts data. This table will be used as the base table for
        creating the transcript view. The default value for this parameter is "transcripts", but you can
        provide a different table name if needed, defaults to transcripts
        :type transcripts_table: str (optional)
        :param column_formats: The `column_formats` parameter is a dictionary that contains information
        about the columns to be used for creating the transcript view. Each entry in the dictionary
        specifies the mapping between a transcripts column and a transcripts infos column. This
        parameter allows you to define how the columns from the transcripts table should be transformed
        or mapped
        :type column_formats: dict
        :param temporary_tables: The `temporary_tables` parameter in the
        `create_transcript_view_from_column_format` function is a list that stores the names of
        temporary views created during the process of creating a transcript view from a column format.
        These temporary views are used to manipulate and extract data before generating the final
        transcript view
        :type temporary_tables: list
        :param annotation_fields: The `annotation_fields` parameter in the
        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
        that are extracted from the temporary views created during the process. These annotation fields
        are obtained by querying the temporary views and extracting the column names excluding specific
        columns like `#CHROM`, `POS`, `REF` and `ALT`
        :type annotation_fields: list
        :param column_rename: The `column_rename` parameter in the
        `create_transcript_view_from_column_format` function is a dictionary that allows you to specify
        custom renaming of columns in the transcripts infos table. By providing a mapping of original
        column names to new column names in this dictionary, you can rename specific columns during the
        process
        :type column_rename: dict
        :param column_clean: The `column_clean` parameter in the
        `create_transcript_view_from_column_format` function is a boolean flag that determines whether
        the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns
        will be cleaned during the creation of the transcript view based on the specified column format,
        defaults to False
        :type column_clean: bool (optional)
        :param column_case: The `column_case` parameter in the
        `create_transcript_view_from_column_format` function is used to specify the case transformation
        to be applied to the columns in the transcript view. It can be set to either "upper" or "lower"
        to convert the column names to uppercase or lowercase, respectively
        :type column_case: str
        :return: The `create_transcript_view_from_column_format` function returns two lists:
        `temporary_tables` and `annotation_fields`.
        """

        log.debug("Start transcrpts view creation from column format...")

        # Expected shape of `column_formats` (from the "transcripts" param struct):
        # "from_column_format": [
        #     {
        #         "transcripts_column": "ANN",
        #         "transcripts_infos_column": "Feature_ID",
        #     }
        # ],

        # Init mutable defaults (avoids the shared mutable-default-argument pitfall)
        if temporary_tables is None:
            temporary_tables = []
        if annotation_fields is None:
            annotation_fields = []

        for column_format in column_formats:

            # INFO field holding the annotations (default "ANN") and the
            # sub-field that identifies the transcript within it
            annotation_field = column_format.get("transcripts_column", "ANN")
            transcript_annotation = column_format.get(
                "transcripts_infos_column", "Feature_ID"
            )

            # Per-format rename mapping overrides the function-level default
            # NOTE(review): this rebinds the parameter, so the override persists
            # for subsequent iterations — confirm this carry-over is intended
            column_rename = column_format.get("column_rename", column_rename)

            # Per-format clean flag overrides the function-level default (same carry-over)
            column_clean = column_format.get("column_clean", column_clean)

            # Per-format case option overrides the function-level default (same carry-over)
            column_case = column_format.get("column_case", column_case)

            # Temporary View name, suffixed with a random tag to avoid collisions
            temporary_view_name = transcripts_table + "".join(
                random.choices(string.ascii_uppercase + string.digits, k=10)
            )

            # Explode the annotation field into a temporary table; returns None
            # when the annotation field is not declared in the VCF header
            temporary_view_name = self.annotation_format_to_table(
                uniquify=True,
                annotation_field=annotation_field,
                view_name=temporary_view_name,
                annotation_id=transcript_annotation,
                column_rename=column_rename,
                column_clean=column_clean,
                column_case=column_case,
            )

            # Collect annotation fields from the created table's column names
            if temporary_view_name:
                query_annotation_fields = f"""
                    SELECT *
                    FROM (
                        DESCRIBE SELECT *
                        FROM {temporary_view_name}
                    )
                    WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
                """
                df_annotation_fields = self.get_query_to_df(
                    query=query_annotation_fields
                )

                # Add temporary view and annotation fields
temporary_tables.append(temporary_view_name) 10769 annotation_fields += list(set(df_annotation_fields["column_name"])) 10770 10771 return temporary_tables, annotation_fields 10772 10773 def create_transcript_view( 10774 self, 10775 transcripts_table: str = None, 10776 transcripts_table_drop: bool = True, 10777 param: dict = {}, 10778 ) -> str: 10779 """ 10780 The `create_transcript_view` function generates a transcript view by processing data from a 10781 specified table based on provided parameters and structural information. 10782 10783 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 10784 is used to specify the name of the table that will store the final transcript view data. If a table 10785 name is not provided, the function will create a new table to store the transcript view data, and by 10786 default,, defaults to transcripts 10787 :type transcripts_table: str (optional) 10788 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 10789 `create_transcript_view` function is a boolean parameter that determines whether to drop the 10790 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 10791 the function will drop the existing transcripts table if it exists, defaults to True 10792 :type transcripts_table_drop: bool (optional) 10793 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 10794 contains information needed to create a transcript view. It includes details such as the structure 10795 of the transcripts, columns mapping, column formats, and other necessary information for generating 10796 the view. This parameter allows for flexibility and customization 10797 :type param: dict 10798 :return: The `create_transcript_view` function returns the name of the transcripts table that was 10799 created or modified during the execution of the function. 
10800 """ 10801 10802 log.debug("Start transcripts view creation...") 10803 10804 # Default 10805 transcripts_table_default = "transcripts" 10806 10807 # Param 10808 if not param: 10809 param = self.get_param() 10810 10811 # Struct 10812 struct = param.get("transcripts", {}).get("struct", None) 10813 10814 # Transcript veresion 10815 transcript_id_remove_version = param.get("transcripts", {}).get( 10816 "transcript_id_remove_version", False 10817 ) 10818 10819 # Transcripts mapping 10820 transcript_id_mapping_file = param.get("transcripts", {}).get( 10821 "transcript_id_mapping_file", None 10822 ) 10823 10824 # Transcripts mapping 10825 transcript_id_mapping_force = param.get("transcripts", {}).get( 10826 "transcript_id_mapping_force", None 10827 ) 10828 10829 if struct: 10830 10831 # Transcripts table 10832 if transcripts_table is None: 10833 transcripts_table = param.get("transcripts", {}).get( 10834 "table", transcripts_table_default 10835 ) 10836 10837 # added_columns 10838 added_columns = [] 10839 10840 # Temporary tables 10841 temporary_tables = [] 10842 10843 # Annotation fields 10844 annotation_fields = [] 10845 10846 # from columns map 10847 columns_maps = struct.get("from_columns_map", []) 10848 added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = ( 10849 self.create_transcript_view_from_columns_map( 10850 transcripts_table=transcripts_table, 10851 columns_maps=columns_maps, 10852 added_columns=added_columns, 10853 temporary_tables=temporary_tables, 10854 annotation_fields=annotation_fields, 10855 ) 10856 ) 10857 added_columns += added_columns_tmp 10858 temporary_tables += temporary_tables_tmp 10859 annotation_fields += annotation_fields_tmp 10860 10861 # from column format 10862 column_formats = struct.get("from_column_format", []) 10863 temporary_tables_tmp, annotation_fields_tmp = ( 10864 self.create_transcript_view_from_column_format( 10865 transcripts_table=transcripts_table, 10866 column_formats=column_formats, 10867 
temporary_tables=temporary_tables, 10868 annotation_fields=annotation_fields, 10869 ) 10870 ) 10871 temporary_tables += temporary_tables_tmp 10872 annotation_fields += annotation_fields_tmp 10873 10874 # Remove some specific fields/column 10875 annotation_fields = list(set(annotation_fields)) 10876 for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]: 10877 if field in annotation_fields: 10878 annotation_fields.remove(field) 10879 10880 # Merge temporary tables query 10881 query_merge = "" 10882 for temporary_table in list(set(temporary_tables)): 10883 10884 # First temporary table 10885 if not query_merge: 10886 query_merge = f""" 10887 SELECT * FROM {temporary_table} 10888 """ 10889 # other temporary table (using UNION) 10890 else: 10891 query_merge += f""" 10892 UNION BY NAME SELECT * FROM {temporary_table} 10893 """ 10894 10895 # transcript table tmp 10896 transcript_table_tmp = "transcripts_tmp" 10897 transcript_table_tmp2 = "transcripts_tmp2" 10898 transcript_table_tmp3 = "transcripts_tmp3" 10899 10900 # Merge on transcript 10901 query_merge_on_transcripts_annotation_fields = [] 10902 10903 # Add transcript list 10904 query_merge_on_transcripts_annotation_fields.append( 10905 f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """ 10906 ) 10907 10908 # Aggregate all annotations fields 10909 for annotation_field in set(annotation_fields): 10910 query_merge_on_transcripts_annotation_fields.append( 10911 f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """ 10912 ) 10913 10914 # Transcripts mapping 10915 if transcript_id_mapping_file: 10916 10917 # Transcript dataframe 10918 transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe" 10919 transcript_id_mapping_dataframe = transcripts_file_to_df( 10920 transcript_id_mapping_file, column_names=["transcript", "alias"] 10921 ) 10922 10923 # 
Transcript version remove 10924 if transcript_id_remove_version: 10925 query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped" 10926 query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)" 10927 query_left_join = f""" 10928 LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1)) 10929 """ 10930 else: 10931 query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped" 10932 query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript" 10933 query_left_join = f""" 10934 LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1)) 10935 """ 10936 10937 # Transcript column for group by merge 10938 query_transcript_merge_group_by = """ 10939 CASE 10940 WHEN transcript_mapped NOT IN ('') 10941 THEN split_part(transcript_mapped, '.', 1) 10942 ELSE split_part(transcript_original, '.', 1) 10943 END 10944 """ 10945 10946 # Merge query 10947 transcripts_tmp2_query = f""" 10948 SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)} 10949 FROM ({query_merge}) AS {transcript_table_tmp} 10950 {query_left_join} 10951 GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by} 10952 """ 10953 10954 # Retrive columns after mege 10955 transcripts_tmp2_describe_query = f""" 10956 DESCRIBE {transcripts_tmp2_query} 10957 """ 10958 transcripts_tmp2_describe_list 
= list( 10959 self.get_query_to_df(query=transcripts_tmp2_describe_query)[ 10960 "column_name" 10961 ] 10962 ) 10963 10964 # Create list of columns for select clause 10965 transcripts_tmp2_describe_select_clause = [] 10966 for field in transcripts_tmp2_describe_list: 10967 if field not in [ 10968 "#CHROM", 10969 "POS", 10970 "REF", 10971 "ALT", 10972 "INFO", 10973 "transcript_mapped", 10974 ]: 10975 as_field = field 10976 if field in ["transcript_original"]: 10977 as_field = "transcripts_mapped" 10978 transcripts_tmp2_describe_select_clause.append( 10979 f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """ 10980 ) 10981 10982 # Merge with mapping 10983 query_merge_on_transcripts = f""" 10984 SELECT 10985 "#CHROM", POS, REF, ALT, INFO, 10986 CASE 10987 WHEN ANY_VALUE(transcript_mapped) NOT IN ('') 10988 THEN ANY_VALUE(transcript_mapped) 10989 ELSE ANY_VALUE(transcript_original) 10990 END AS transcript, 10991 {", ".join(transcripts_tmp2_describe_select_clause)} 10992 FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2} 10993 GROUP BY "#CHROM", POS, REF, ALT, INFO, 10994 {query_transcript_merge_group_by} 10995 """ 10996 10997 # Add transcript filter from mapping file 10998 if transcript_id_mapping_force: 10999 query_merge_on_transcripts = f""" 11000 SELECT * 11001 FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3} 11002 WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe) 11003 """ 11004 11005 # No transcript mapping 11006 else: 11007 11008 # Remove transcript version 11009 if transcript_id_remove_version: 11010 query_transcript_column = f""" 11011 split_part({transcript_table_tmp}.transcript, '.', 1) 11012 """ 11013 else: 11014 query_transcript_column = """ 11015 transcript 11016 """ 11017 11018 # Query sections 11019 query_transcript_column_select = ( 11020 f"{query_transcript_column} AS transcript" 
11021 ) 11022 query_transcript_column_group_by = query_transcript_column 11023 11024 # Query for transcripts view 11025 query_merge_on_transcripts = f""" 11026 SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)} 11027 FROM ({query_merge}) AS {transcript_table_tmp} 11028 GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} 11029 """ 11030 11031 log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}") 11032 11033 # Drop transcript view is necessary 11034 if transcripts_table_drop: 11035 query_drop = f""" 11036 DROP TABLE IF EXISTS {transcripts_table}; 11037 """ 11038 self.execute_query(query=query_drop) 11039 11040 # Merge and create transcript view 11041 query_create_view = f""" 11042 CREATE TABLE IF NOT EXISTS {transcripts_table} 11043 AS {query_merge_on_transcripts} 11044 """ 11045 self.execute_query(query=query_create_view) 11046 11047 # Remove added columns 11048 for added_column in added_columns: 11049 self.drop_column(column=added_column) 11050 11051 else: 11052 11053 transcripts_table = None 11054 11055 return transcripts_table 11056 11057 def annotation_format_to_table( 11058 self, 11059 uniquify: bool = True, 11060 annotation_field: str = "ANN", 11061 annotation_id: str = "Feature_ID", 11062 view_name: str = "transcripts", 11063 column_rename: dict = {}, 11064 column_clean: bool = False, 11065 column_case: str = None, 11066 ) -> str: 11067 """ 11068 The `annotation_format_to_table` function converts annotation data from a VCF file into a 11069 structured table format, ensuring unique values and creating a temporary table for further 11070 processing or analysis. 11071 11072 :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure 11073 unique values in the output or not. 
If set to `True`, the function will make sure that the
        output values are unique, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file
        that contains the annotation information for each variant. This field is used to extract the
        annotation details for further processing in the function. By default, it is set to "ANN",
        defaults to ANN
        :type annotation_field: str (optional)
        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method
        is used to specify the identifier for the annotation feature. This identifier will be used as a
        column name in the resulting table or view that is created based on the annotation data. It
        helps in uniquely identifying each annotation entry, defaults to Feature_ID
        :type annotation_id: str (optional)
        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used
        to specify the name of the temporary table that will be created to store the transformed
        annotation data. This table will hold the extracted information from the annotation field in a
        structured format for further processing or analysis, defaults to transcripts
        :type view_name: str (optional)
        :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method
        is a dictionary that allows you to specify custom renaming for columns. By providing key-value
        pairs in this dictionary, you can rename specific columns in the resulting table or view that is
        created based on the annotation data
        :type column_rename: dict
        :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is
        a boolean flag that determines whether the annotation field should undergo a cleaning process.
        If set to `True`, the function will clean the annotation field before further processing. This
        cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults
        to False
        :type column_clean: bool (optional)
        :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is
        used to specify the case transformation to be applied to the column names extracted from the
        annotation data. It allows you to set the case of the column names to either lowercase or
        uppercase for consistency or other specific requirements during the conversion
        :type column_case: str
        :return: The function `annotation_format_to_table` is returning the name of the view created,
        which is stored in the variable `view_name`, or `None` when the annotation field is not
        declared in the VCF header.
        """

        # Name of the intermediate exploded-annotation (JSON) column
        annotation_format = "annotation_explode"

        # Apply rename/clean to the transcript identifier so it matches the
        # renamed/cleaned column names of the created table
        if column_rename:
            annotation_id = column_rename.get(annotation_id, annotation_id)

        if column_clean:
            annotation_id = clean_annotation_field(annotation_id)

        # Prefix used for exploded INFO columns
        # NOTE(review): any non-empty configured prefix is replaced by "INFO/"
        # (an empty prefix stays empty) — confirm this normalization is intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Annotation fields (prefixed column names)
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added to the variants table (dropped before returning)
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract ANN header: sub-field names are quoted in the INFO
            # description and separated by " | "
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
11151 ann_header_desc = {} 11152 for i in range(len(ann_header_match)): 11153 ann_header_info = "".join( 11154 char for char in ann_header_match[i] if char.isalnum() 11155 ) 11156 ann_header.append(ann_header_info) 11157 ann_header_desc[ann_header_info] = ann_header_match[i] 11158 if not ann_header_desc: 11159 raise ValueError("Invalid header description format") 11160 else: 11161 raise ValueError("Invalid header description format") 11162 11163 # Create variant id 11164 variant_id_column = self.get_variant_id_column() 11165 added_columns += [variant_id_column] 11166 11167 # Create dataframe 11168 dataframe_annotation_format = self.get_query_to_df( 11169 f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """ 11170 ) 11171 11172 # Create annotation columns 11173 dataframe_annotation_format[ 11174 annotation_format_infos 11175 ] = dataframe_annotation_format[annotation_infos].apply( 11176 lambda x: explode_annotation_format( 11177 annotation=str(x), 11178 uniquify=uniquify, 11179 output_format="JSON", 11180 prefix="", 11181 header=list(ann_header_desc.values()), 11182 ) 11183 ) 11184 11185 # Find keys 11186 query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;""" 11187 df_keys = self.get_query_to_df(query=query_json) 11188 11189 # Check keys 11190 query_json_key = [] 11191 for _, row in df_keys.iterrows(): 11192 11193 # Key 11194 key = row.iloc[0] 11195 key_clean = key 11196 11197 # key rename 11198 if column_rename: 11199 key_clean = column_rename.get(key_clean, key_clean) 11200 11201 # key clean 11202 if column_clean: 11203 key_clean = clean_annotation_field(key_clean) 11204 11205 # Key case 11206 if column_case: 11207 if column_case.lower() in ["lower"]: 11208 key_clean = key_clean.lower() 11209 elif column_case.lower() in ["upper"]: 11210 key_clean = key_clean.upper() 11211 11212 # Type 11213 query_json_type = f"""SELECT 
unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');""" 11214 11215 # Get DataFrame from query 11216 df_json_type = self.get_query_to_df(query=query_json_type) 11217 11218 # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN 11219 with pd.option_context("future.no_silent_downcasting", True): 11220 df_json_type.fillna(value="", inplace=True) 11221 replace_dict = {None: np.nan, "": np.nan} 11222 df_json_type.replace(replace_dict, inplace=True) 11223 df_json_type.dropna(inplace=True) 11224 11225 # Detect column type 11226 column_type = detect_column_type(df_json_type[key_clean]) 11227 11228 # Append 11229 query_json_key.append( 11230 f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """ 11231 ) 11232 11233 # Create view 11234 query_view = f""" 11235 CREATE TEMPORARY TABLE {view_name} 11236 AS ( 11237 SELECT *, {annotation_id} AS 'transcript' 11238 FROM ( 11239 SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)} 11240 FROM dataframe_annotation_format 11241 ) 11242 ); 11243 """ 11244 self.execute_query(query=query_view) 11245 11246 else: 11247 11248 # Return None 11249 view_name = None 11250 11251 # Remove added columns 11252 for added_column in added_columns: 11253 self.drop_column(column=added_column) 11254 11255 return view_name 11256 11257 def transcript_view_to_variants( 11258 self, 11259 transcripts_table: str = None, 11260 transcripts_column_id: str = None, 11261 transcripts_info_json: str = None, 11262 transcripts_info_field_json: str = None, 11263 transcripts_info_format: str = None, 11264 transcripts_info_field_format: str = None, 11265 param: dict = {}, 11266 ) -> bool: 11267 """ 11268 The `transcript_view_to_variants` function updates a variants table with information from 11269 transcripts in JSON format. 
11270 11271 :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the 11272 table containing the transcripts data. If this parameter is not provided, the function will 11273 attempt to retrieve it from the `param` dictionary or use a default value of "transcripts" 11274 :type transcripts_table: str 11275 :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the 11276 column in the `transcripts_table` that contains the unique identifier for each transcript. This 11277 identifier is used to match transcripts with variants in the database 11278 :type transcripts_column_id: str 11279 :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name 11280 of the column in the variants table where the transcripts information will be stored in JSON 11281 format. This parameter allows you to define the column in the variants table that will hold the 11282 JSON-formatted information about transcripts 11283 :type transcripts_info_json: str 11284 :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to 11285 specify the field in the VCF header that will contain information about transcripts in JSON 11286 format. This field will be added to the VCF header as an INFO field with the specified name 11287 :type transcripts_info_field_json: str 11288 :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the 11289 format of the information about transcripts that will be stored in the variants table. This 11290 format can be used to define how the transcript information will be structured or displayed 11291 within the variants table 11292 :type transcripts_info_format: str 11293 :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to 11294 specify the field in the VCF header that will contain information about transcripts in a 11295 specific format. 
This field will be added to the VCF header as an INFO field with the specified 11296 name 11297 :type transcripts_info_field_format: str 11298 :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary 11299 that contains various configuration settings related to transcripts. It is used to provide 11300 default values for certain parameters if they are not explicitly provided when calling the 11301 method. The `param` dictionary can be passed as an argument 11302 :type param: dict 11303 :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True` 11304 if the operation is successful and `False` if certain conditions are not met. 11305 """ 11306 11307 msg_info_prefix = "Start transcripts view to variants annotations" 11308 11309 log.debug(f"{msg_info_prefix}...") 11310 11311 # Default 11312 transcripts_table_default = "transcripts" 11313 transcripts_column_id_default = "transcript" 11314 transcripts_info_json_default = None 11315 transcripts_info_format_default = None 11316 transcripts_info_field_json_default = None 11317 transcripts_info_field_format_default = None 11318 11319 # Param 11320 if not param: 11321 param = self.get_param() 11322 11323 # Transcripts table 11324 if transcripts_table is None: 11325 transcripts_table = param.get("transcripts", {}).get( 11326 "table", transcripts_table_default 11327 ) 11328 11329 # Transcripts column ID 11330 if transcripts_column_id is None: 11331 transcripts_column_id = param.get("transcripts", {}).get( 11332 "column_id", transcripts_column_id_default 11333 ) 11334 11335 # Transcripts info json 11336 if transcripts_info_json is None: 11337 transcripts_info_json = param.get("transcripts", {}).get( 11338 "transcripts_info_json", transcripts_info_json_default 11339 ) 11340 11341 # Transcripts info field JSON 11342 if transcripts_info_field_json is None: 11343 transcripts_info_field_json = param.get("transcripts", {}).get( 11344 
"transcripts_info_field_json", transcripts_info_field_json_default 11345 ) 11346 # if transcripts_info_field_json is not None and transcripts_info_json is None: 11347 # transcripts_info_json = transcripts_info_field_json 11348 11349 # Transcripts info format 11350 if transcripts_info_format is None: 11351 transcripts_info_format = param.get("transcripts", {}).get( 11352 "transcripts_info_format", transcripts_info_format_default 11353 ) 11354 11355 # Transcripts info field FORMAT 11356 if transcripts_info_field_format is None: 11357 transcripts_info_field_format = param.get("transcripts", {}).get( 11358 "transcripts_info_field_format", transcripts_info_field_format_default 11359 ) 11360 # if ( 11361 # transcripts_info_field_format is not None 11362 # and transcripts_info_format is None 11363 # ): 11364 # transcripts_info_format = transcripts_info_field_format 11365 11366 # Variants table 11367 table_variants = self.get_table_variants() 11368 11369 # Check info columns param 11370 if ( 11371 transcripts_info_json is None 11372 and transcripts_info_field_json is None 11373 and transcripts_info_format is None 11374 and transcripts_info_field_format is None 11375 ): 11376 return False 11377 11378 # Transcripts infos columns 11379 query_transcripts_infos_columns = f""" 11380 SELECT * 11381 FROM ( 11382 DESCRIBE SELECT * FROM {transcripts_table} 11383 ) 11384 WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}') 11385 """ 11386 transcripts_infos_columns = list( 11387 self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"] 11388 ) 11389 11390 # View results 11391 clause_select = [] 11392 clause_to_json = [] 11393 clause_to_format = [] 11394 for field in transcripts_infos_columns: 11395 # Do not consider INFO field for export into fields 11396 if field not in ["INFO"]: 11397 clause_select.append( 11398 f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """ 11399 ) 11400 clause_to_json.append(f""" 
'{field}': "{field}" """) 11401 clause_to_format.append(f""" "{field}" """) 11402 11403 # Update 11404 update_set_json = [] 11405 update_set_format = [] 11406 11407 # VCF header 11408 vcf_reader = self.get_header() 11409 11410 # Transcripts to info column in JSON 11411 if transcripts_info_json: 11412 11413 # Create column on variants table 11414 self.add_column( 11415 table_name=table_variants, 11416 column_name=transcripts_info_json, 11417 column_type="JSON", 11418 default_value=None, 11419 drop=False, 11420 ) 11421 11422 # Add header 11423 vcf_reader.infos[transcripts_info_json] = vcf.parser._Info( 11424 transcripts_info_json, 11425 ".", 11426 "String", 11427 "Transcripts in JSON format", 11428 "unknwon", 11429 "unknwon", 11430 self.code_type_map["String"], 11431 ) 11432 11433 # Add to update 11434 update_set_json.append( 11435 f""" {transcripts_info_json}=t.{transcripts_info_json} """ 11436 ) 11437 11438 # Transcripts to info field in JSON 11439 if transcripts_info_field_json: 11440 11441 log.debug(f"{msg_info_prefix} - Annotation in JSON format...") 11442 11443 # Add to update 11444 update_set_json.append( 11445 f""" 11446 INFO = concat( 11447 CASE 11448 WHEN INFO NOT IN ('', '.') 11449 THEN INFO 11450 ELSE '' 11451 END, 11452 CASE 11453 WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.') 11454 THEN concat( 11455 ';{transcripts_info_field_json}=', 11456 t.{transcripts_info_json} 11457 ) 11458 ELSE '' 11459 END 11460 ) 11461 """ 11462 ) 11463 11464 # Add header 11465 vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info( 11466 transcripts_info_field_json, 11467 ".", 11468 "String", 11469 "Transcripts in JSON format", 11470 "unknwon", 11471 "unknwon", 11472 self.code_type_map["String"], 11473 ) 11474 11475 if update_set_json: 11476 11477 # Update query 11478 query_update = f""" 11479 UPDATE {table_variants} 11480 SET {", ".join(update_set_json)} 11481 FROM 11482 ( 11483 SELECT 11484 "#CHROM", POS, REF, ALT, 11485 concat( 11486 '{{', 11487 
string_agg( 11488 '"' || "{transcripts_column_id}" || '":' || 11489 to_json(json_output) 11490 ), 11491 '}}' 11492 )::JSON AS {transcripts_info_json} 11493 FROM 11494 ( 11495 SELECT 11496 "#CHROM", POS, REF, ALT, 11497 "{transcripts_column_id}", 11498 to_json( 11499 {{{",".join(clause_to_json)}}} 11500 )::JSON AS json_output 11501 FROM 11502 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11503 WHERE "{transcripts_column_id}" IS NOT NULL 11504 ) 11505 GROUP BY "#CHROM", POS, REF, ALT 11506 ) AS t 11507 WHERE {table_variants}."#CHROM" = t."#CHROM" 11508 AND {table_variants}."POS" = t."POS" 11509 AND {table_variants}."REF" = t."REF" 11510 AND {table_variants}."ALT" = t."ALT" 11511 """ 11512 11513 self.execute_query(query=query_update) 11514 11515 # Transcripts to info column in FORMAT 11516 if transcripts_info_format: 11517 11518 # Create column on variants table 11519 self.add_column( 11520 table_name=table_variants, 11521 column_name=transcripts_info_format, 11522 column_type="VARCHAR", 11523 default_value=None, 11524 drop=False, 11525 ) 11526 11527 # Add header 11528 vcf_reader.infos[transcripts_info_format] = vcf.parser._Info( 11529 transcripts_info_format, 11530 ".", 11531 "String", 11532 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11533 "unknwon", 11534 "unknwon", 11535 self.code_type_map["String"], 11536 ) 11537 11538 # Add to update 11539 update_set_format.append( 11540 f""" {transcripts_info_format}=t.{transcripts_info_format} """ 11541 ) 11542 11543 else: 11544 11545 # Set variable for internal queries 11546 transcripts_info_format = "transcripts_info_format" 11547 11548 # Transcripts to info field in JSON 11549 if transcripts_info_field_format: 11550 11551 log.debug(f"{msg_info_prefix} - Annotation in structured format...") 11552 11553 # Add to update 11554 update_set_format.append( 11555 f""" 11556 INFO = concat( 11557 CASE 11558 WHEN INFO NOT IN ('', 
'.') 11559 THEN INFO 11560 ELSE '' 11561 END, 11562 CASE 11563 WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.') 11564 THEN concat( 11565 ';{transcripts_info_field_format}=', 11566 t.{transcripts_info_format} 11567 ) 11568 ELSE '' 11569 END 11570 ) 11571 """ 11572 ) 11573 11574 # Add header 11575 vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info( 11576 transcripts_info_field_format, 11577 ".", 11578 "String", 11579 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11580 "unknwon", 11581 "unknwon", 11582 self.code_type_map["String"], 11583 ) 11584 11585 if update_set_format: 11586 11587 # Update query 11588 query_update = f""" 11589 UPDATE {table_variants} 11590 SET {", ".join(update_set_format)} 11591 FROM 11592 ( 11593 SELECT 11594 "#CHROM", POS, REF, ALT, 11595 string_agg({transcripts_info_format}) AS {transcripts_info_format} 11596 FROM 11597 ( 11598 SELECT 11599 "#CHROM", POS, REF, ALT, 11600 "{transcripts_column_id}", 11601 concat( 11602 "{transcripts_column_id}", 11603 '|', 11604 {", '|', ".join(clause_to_format)} 11605 ) AS {transcripts_info_format} 11606 FROM 11607 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11608 ) 11609 GROUP BY "#CHROM", POS, REF, ALT 11610 ) AS t 11611 WHERE {table_variants}."#CHROM" = t."#CHROM" 11612 AND {table_variants}."POS" = t."POS" 11613 AND {table_variants}."REF" = t."REF" 11614 AND {table_variants}."ALT" = t."ALT" 11615 """ 11616 11617 self.execute_query(query=query_update) 11618 11619 return True
    def __init__(
        self,
        conn=None,
        input: str = None,
        output: str = None,
        config: dict = {},
        param: dict = {},
        load: bool = False,
    ) -> None:
        """
        Initialize a Variants object: set the input/output files, the
        configuration and parameter dictionaries, the database connection
        and the VCF header, then optionally load the input data.

        :param conn: existing database connection (duckdb or sqlite); a new
            one is created by `set_connexion` when None
        :param input: input file path (e.g. VCF, parquet, duckdb)
        :param output: output file path
        :param config: configuration dictionary
        :param param: parameters dictionary
        :param load: when True, load the input data into the variants table
        """

        # Initialize internal attributes and mappings
        self.init_variables()

        # Input file (sets name, extension, format)
        self.set_input(input)

        # Configuration dictionary
        self.set_config(config)

        # Parameters dictionary
        self.set_param(param)

        # Output file (sets name, extension, format)
        self.set_output(output)

        # Database connection (created if not provided)
        self.set_connexion(conn)

        # VCF header (read from input file or default)
        self.set_header()

        # Samples list (from param if not provided)
        self.set_samples()

        # Load input data into the database if requested
        if load:
            self.load_data()
The function `__init__` initializes the variables and sets the input, output, config, param,
connection, and header.
Parameters
- conn: the connection to the database
- input: the input file
- output: the output file
- config: a dictionary containing the configuration of the model
- param: a dictionary containing the parameters of the model
86 def set_samples(self, samples: list = None) -> list: 87 """ 88 The function `set_samples` sets the samples attribute of an object to a provided list or 89 retrieves it from a parameter dictionary. 90 91 :param samples: The `set_samples` method is a method of a class that takes a list of samples as 92 input and sets the `samples` attribute of the class to the provided list. If no samples are 93 provided, it tries to get the samples from the class's parameters using the `get_param` method 94 :type samples: list 95 :return: The `samples` list is being returned. 96 """ 97 98 if not samples: 99 samples = self.get_param().get("samples", {}).get("list", None) 100 101 self.samples = samples 102 103 return samples
The function set_samples sets the samples attribute of an object to a provided list or
retrieves it from a parameter dictionary.
Parameters
- samples: The
`set_samples` method takes a list of samples as input and sets the `samples` attribute of the class to the provided list. If no samples are provided, it tries to get the samples from the class's parameters using the `get_param` method
Returns
The
`samples` list is being returned.
105 def get_samples(self) -> list: 106 """ 107 This function returns a list of samples. 108 :return: The `get_samples` method is returning the `samples` attribute of the object. 109 """ 110 111 return self.samples
This function returns a list of samples.
Returns
The
`get_samples` method returns the `samples` attribute of the object.
113 def get_samples_check(self) -> bool: 114 """ 115 This function returns the value of the "check" key within the "samples" dictionary retrieved 116 from the parameters. 117 :return: The method `get_samples_check` is returning the value of the key "check" inside the 118 "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` 119 method. If the key "check" is not found, it will return `False`. 120 """ 121 122 return self.get_param().get("samples", {}).get("check", True)
This function returns the value of the "check" key within the "samples" dictionary retrieved from the parameters.
Returns
The method
`get_samples_check` returns the value of the key "check" inside the "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` method. If the key "check" is not found, it returns `True` by default.
124 def set_input(self, input: str = None) -> None: 125 """ 126 The function `set_input` takes a file name as input, extracts the name and extension, and sets 127 attributes in the class accordingly. 128 129 :param input: The `set_input` method in the provided code snippet is used to set attributes 130 related to the input file. Here's a breakdown of the parameters and their usage in the method: 131 :type input: str 132 """ 133 134 if input and not isinstance(input, str): 135 try: 136 self.input = input.name 137 except: 138 log.error(f"Input file '{input} in bad format") 139 raise ValueError(f"Input file '{input} in bad format") 140 else: 141 self.input = input 142 143 # Input format 144 if input: 145 input_name, input_extension = os.path.splitext(self.input) 146 self.input_name = input_name 147 self.input_extension = input_extension 148 self.input_format = self.input_extension.replace(".", "")
The function set_input takes a file name as input, extracts the name and extension, and sets
attributes in the class accordingly.
Parameters
- input: The
`set_input` method is used to set attributes related to the input file (path, name, extension, and format).
150 def set_config(self, config: dict) -> None: 151 """ 152 The set_config function takes a config object and assigns it as the configuration object for the 153 class. 154 155 :param config: The `config` parameter in the `set_config` function is a dictionary object that 156 contains configuration settings for the class. When you call the `set_config` function with a 157 dictionary object as the argument, it will set that dictionary as the configuration object for 158 the class 159 :type config: dict 160 """ 161 162 self.config = config
The set_config function takes a config object and assigns it as the configuration object for the class.
Parameters
- config: The
`config` parameter in the `set_config` function is a dictionary object that contains configuration settings for the class. When you call the `set_config` function with a dictionary object as the argument, it sets that dictionary as the configuration object for the class.
164 def set_param(self, param: dict) -> None: 165 """ 166 This function sets a parameter object for the class based on the input dictionary. 167 168 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 169 as the `param` attribute of the class instance 170 :type param: dict 171 """ 172 173 self.param = param
This function sets a parameter object for the class based on the input dictionary.
Parameters
- param: The
`set_param` method takes a dictionary object as input and sets it as the `param` attribute of the class instance.
175 def init_variables(self) -> None: 176 """ 177 This function initializes the variables that will be used in the rest of the class 178 """ 179 180 self.prefix = "howard" 181 self.table_variants = "variants" 182 self.dataframe = None 183 184 self.comparison_map = { 185 "gt": ">", 186 "gte": ">=", 187 "lt": "<", 188 "lte": "<=", 189 "equals": "=", 190 "contains": "SIMILAR TO", 191 } 192 193 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 194 195 self.code_type_map_to_sql = { 196 "Integer": "INTEGER", 197 "String": "VARCHAR", 198 "Float": "FLOAT", 199 "Flag": "VARCHAR", 200 } 201 202 self.index_additionnal_fields = []
This function initializes the variables that will be used in the rest of the class
204 def get_indexing(self) -> bool: 205 """ 206 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 207 returns False. 208 :return: The value of the indexing parameter. 209 """ 210 211 return self.get_param().get("indexing", False)
It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.
Returns
The value of the indexing parameter.
213 def get_connexion_config(self) -> dict: 214 """ 215 The function `get_connexion_config` returns a dictionary containing the configuration for a 216 connection, including the number of threads and memory limit. 217 :return: a dictionary containing the configuration for the Connexion library. 218 """ 219 220 # config 221 config = self.get_config() 222 223 # Connexion config 224 connexion_config = {} 225 threads = self.get_threads() 226 227 # Threads 228 if threads: 229 connexion_config["threads"] = threads 230 231 # Memory 232 # if config.get("memory", None): 233 # connexion_config["memory_limit"] = config.get("memory") 234 if self.get_memory(): 235 connexion_config["memory_limit"] = self.get_memory() 236 237 # Temporary directory 238 if config.get("tmp", None): 239 connexion_config["temp_directory"] = config.get("tmp") 240 241 # Access 242 if config.get("access", None): 243 access = config.get("access") 244 if access in ["RO"]: 245 access = "READ_ONLY" 246 elif access in ["RW"]: 247 access = "READ_WRITE" 248 connexion_db = self.get_connexion_db() 249 if connexion_db in ":memory:": 250 access = "READ_WRITE" 251 connexion_config["access_mode"] = access 252 253 return connexion_config
The function get_connexion_config returns a dictionary containing the configuration for a
connection, including the number of threads and memory limit.
Returns
a dictionary containing the connection configuration.
255 def get_duckdb_settings(self) -> dict: 256 """ 257 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 258 string. 259 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 260 """ 261 262 # config 263 config = self.get_config() 264 265 # duckdb settings 266 duckdb_settings_dict = {} 267 if config.get("duckdb_settings", None): 268 duckdb_settings = config.get("duckdb_settings") 269 duckdb_settings = full_path(duckdb_settings) 270 # duckdb setting is a file 271 if os.path.exists(duckdb_settings): 272 with open(duckdb_settings) as json_file: 273 duckdb_settings_dict = yaml.safe_load(json_file) 274 # duckdb settings is a string 275 else: 276 duckdb_settings_dict = json.loads(duckdb_settings) 277 278 return duckdb_settings_dict
The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a
string.
Returns
The function
`get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
280 def set_connexion_db(self) -> str: 281 """ 282 The function `set_connexion_db` returns the appropriate database connection string based on the 283 input format and connection type. 284 :return: the value of the variable `connexion_db`. 285 """ 286 287 # Default connexion db 288 default_connexion_db = ":memory:" 289 290 # Find connexion db 291 if self.get_input_format() in ["db", "duckdb"]: 292 connexion_db = self.get_input() 293 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 294 connexion_db = default_connexion_db 295 elif self.get_connexion_type() in ["tmpfile"]: 296 tmp_name = tempfile.mkdtemp( 297 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 298 ) 299 connexion_db = f"{tmp_name}/tmp.db" 300 elif self.get_connexion_type() != "": 301 connexion_db = self.get_connexion_type() 302 else: 303 connexion_db = default_connexion_db 304 305 # Set connexion db 306 self.connexion_db = connexion_db 307 308 return connexion_db
The function set_connexion_db returns the appropriate database connection string based on the
input format and connection type.
Returns
the value of the variable
`connexion_db`.
    def set_connexion(self, conn) -> None:
        """
        Create (or adopt) the database connection.

        When no connection is provided, a new duckdb or sqlite connection
        is opened on the connection string from `set_connexion_db`, using
        the configuration from `get_connexion_config`; for duckdb, any
        settings from `get_duckdb_settings` are applied as PRAGMA
        statements.

        :param conn: existing database connection to reuse; when None a
            new connection is created according to the configured format
        """

        # Connection string (file path or ":memory:")
        connexion_db = self.set_connexion_db()

        # Connection configuration (threads, memory, access mode...)
        connexion_config = self.get_connexion_config()

        # Connection format (duckdb by default)
        connexion_format = self.get_config().get("connexion_format", "duckdb")
        # Store the chosen format
        self.connexion_format = connexion_format

        # Create a connection only when none was provided
        if not conn:
            if connexion_format in ["duckdb"]:
                conn = duckdb.connect(connexion_db, config=connexion_config)
                # Apply duckdb settings as PRAGMA statements
                duckdb_settings = self.get_duckdb_settings()
                if duckdb_settings:
                    for setting in duckdb_settings:
                        setting_value = duckdb_settings.get(setting)
                        if isinstance(setting_value, str):
                            # Quote string values for the PRAGMA statement
                            setting_value = f"'{setting_value}'"
                        conn.execute(f"PRAGMA {setting}={setting_value};")
            elif connexion_format in ["sqlite"]:
                conn = sqlite3.connect(connexion_db)

        # Store the connection
        self.conn = conn

        # Log
        log.debug(f"connexion_format: {connexion_format}")
        log.debug(f"connexion_db: {connexion_db}")
        log.debug(f"connexion config: {connexion_config}")
        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
The function set_connexion creates a connection to a database, with options for different
database formats and settings.
Parameters
- conn: The
`conn` parameter in the `set_connexion` method is the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then proceeds to set up the connection based on the specified format (e.g., duckdb or sqlite).
356 def set_output(self, output: str = None) -> None: 357 """ 358 The `set_output` function in Python sets the output file based on the input or a specified key 359 in the config file, extracting the output name, extension, and format. 360 361 :param output: The `output` parameter in the `set_output` method is used to specify the name of 362 the output file. If the config file has an 'output' key, the method sets the output to the value 363 of that key. If no output is provided, it sets the output to `None` 364 :type output: str 365 """ 366 367 if output and not isinstance(output, str): 368 self.output = output.name 369 else: 370 self.output = output 371 372 # Output format 373 if self.output: 374 output_name, output_extension = os.path.splitext(self.output) 375 self.output_name = output_name 376 self.output_extension = output_extension 377 self.output_format = self.output_extension.replace(".", "") 378 else: 379 self.output_name = None 380 self.output_extension = None 381 self.output_format = None
The set_output function in Python sets the output file based on the input or a specified key
in the config file, extracting the output name, extension, and format.
Parameters
- output: The
`output` parameter in the `set_output` method is used to specify the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to `None`.
    def set_header(self) -> None:
        """
        Read the header of the input file and store it both as a list of
        strings (``header_list``) and as a ``vcf.Reader`` object
        (``header_vcf``).

        The header is looked up, in order: in an explicit ``header_file``
        from the config, inside the VCF file itself, in a sidecar
        ``<input>.hdr`` file, or reconstructed from the file's columns;
        a minimal default VCF header is used as a fallback. When there is
        no input file, both attributes are set to None.
        """

        input_file = self.get_input()
        # Minimal fallback header
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # Header file explicitly provided in the config
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # VCF file format: header is within the input file itself
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # Compressed vcf file (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # Uncompressed vcf file (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # Header provided in default external sidecar file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    # Try to reconstruct header info fields from the
                    # file's columns
                    try:

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database object on the input file
                            db_for_header = Database(database=input_file)

                            # Header inferred from the file's columns
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Real columns present in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write the inferred header to a temporary file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Re-read it and replace the #CHROM line with
                            # the real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    # NOTE(review): bare except — any failure here silently
                    # falls back to the default header
                    except:

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:
                # Unknown input format: refuse to continue
                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # Header as list of strings
            self.header_list = header_list

            # Header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            self.header_list = None
            self.header_vcf = None
It reads the header of a VCF file and stores it as a list of strings and as a VCF object
485 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 486 """ 487 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 488 DataFrame based on the connection format. 489 490 :param query: The `query` parameter in the `get_query_to_df` function is a string that 491 represents the SQL query you want to execute. This query will be used to fetch data from a 492 database and convert it into a pandas DataFrame 493 :type query: str 494 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 495 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 496 function will only fetch up to that number of rows from the database query result. If no limit 497 is specified, 498 :type limit: int 499 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 500 """ 501 502 # Connexion format 503 connexion_format = self.get_connexion_format() 504 505 # Limit in query 506 if limit: 507 pd.set_option("display.max_rows", limit) 508 if connexion_format in ["duckdb"]: 509 df = ( 510 self.conn.execute(query) 511 .fetch_record_batch(limit) 512 .read_next_batch() 513 .to_pandas() 514 ) 515 elif connexion_format in ["sqlite"]: 516 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 517 518 # Full query 519 else: 520 if connexion_format in ["duckdb"]: 521 df = self.conn.execute(query).df() 522 elif connexion_format in ["sqlite"]: 523 df = pd.read_sql_query(query, self.conn) 524 525 return df
The get_query_to_df function takes a query as a string and returns the result as a pandas
DataFrame based on the connection format.
Parameters
- query: The
`query` parameter in the `get_query_to_df` function is a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame.
- limit: The `limit` parameter in the `get_query_to_df` function is used to specify the maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the function will only fetch up to that number of rows from the database query result. If no limit is specified, the full result is returned.
Returns
A pandas DataFrame is returned by the
`get_query_to_df` function.
    def get_overview(self) -> None:
        """
        Log an overview of the current object: input and output files with
        their formats, configuration, parameters, sample list, and a
        preview of the variants dataframe.

        :return: None
        """
        # Build a preview query over the header columns of the variants table
        table_variants_from = self.get_table_variants(clause="from")
        sql_columns = self.get_header_columns_as_sql()
        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
        df = self.get_query_to_df(sql_query_export)
        log.info(
            "Input: "
            + str(self.get_input())
            + " ["
            + str(str(self.get_input_format()))
            + "]"
        )
        log.info(
            "Output: "
            + str(self.get_output())
            + " ["
            + str(str(self.get_output_format()))
            + "]"
        )
        log.info("Config: ")
        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Param: ")
        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Sample list: " + str(self.get_header_sample_list()))
        log.info("Dataframe: ")
        for d in str(df).split("\n"):
            log.info("\t" + str(d))

        # Release the preview dataframe eagerly
        del df
        gc.collect()

        return None
The function prints the input, output, config, and dataframe of the current object
    def get_stats(self) -> dict:
        """
        Compute statistics on the loaded variants: input file, number of
        variants (total, by chromosome, by type), samples and their
        genotypes, INFO/FORMAT header fields, quality distribution and
        SNV substitutions.

        :return: a dictionary with keys "Infos", "Variants", "Samples",
            "Header" and (when a QUAL column exists) "Quality"
        """

        # Log
        log.info(f"Stats Calculation...")

        # Variants table
        table_variants_from = self.get_table_variants()

        # Stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header INFO and FORMAT fields
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chromosome
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Fraction of variants per chromosome
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Genotype counts per sample (only when a GT FORMAT field and a
        # FORMAT column are present)
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count genotypes, keeping only well-formed sample values:
                # a genotype-like prefix and as many fields as FORMAT
                sql_query_samples = f"""
                    SELECT
                        '{sample}' as sample,
                        REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                        count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                        concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                    )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                # A sample counts only when it has at least one genotype
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

        stats["Samples"] = samples
        stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # Number (special VCF codes: None='.', -1='A', -2='G', -3='R')
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # Type ('.' when missing)
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # Description ('' when missing)
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL statistics (only when a QUAL column exists)
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                SELECT
                    avg(CAST(QUAL AS INTEGER)) AS Average,
                    min(CAST(QUAL AS INTEGER)) AS Minimum,
                    max(CAST(QUAL AS INTEGER)) AS Maximum,
                    stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                    median(CAST(QUAL AS INTEGER)) AS Median,
                    variance(CAST(QUAL AS INTEGER)) AS Variance
                FROM {table_variants_from}
                WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV, MNV and InDel counts

        sql_query_snv = f"""

            SELECT Type, count FROM (

                SELECT
                    'Total' AS Type,
                    count(*) AS count
                FROM {table_variants_from}

                UNION

                SELECT
                    'MNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 AND len(ALT) > 1
                AND len(REF) = len(ALT)

                UNION

                SELECT
                    'InDel' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 OR len(ALT) > 1
                AND len(REF) != len(ALT)

                UNION

                SELECT
                    'SNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1

            )

            ORDER BY count DESC

            """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # SNV substitution counts (REF>ALT pairs)
        sql_query_snv_substitution = f"""
            SELECT
                concat(REF, '>', ALT) AS 'Substitution',
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) = 1 AND len(ALT) = 1
            GROUP BY REF, ALT
            ORDER BY count(*) DESC
            """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
The `get_stats` function calculates and returns various statistics of the current object, including information about the input file, variants, samples, header fields, quality, and SNVs/InDels.
Returns
A dictionary containing various statistics of the current object.
792 def stats_to_file(self, file: str = None) -> str: 793 """ 794 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 795 into a JSON object, and writes the JSON object to the specified file. 796 797 :param file: The `file` parameter is a string that represents the file path where the JSON data 798 will be written 799 :type file: str 800 :return: the name of the file that was written to. 801 """ 802 803 # Get stats 804 stats = self.get_stats() 805 806 # Serializing json 807 json_object = json.dumps(stats, indent=4) 808 809 # Writing to sample.json 810 with open(file, "w") as outfile: 811 outfile.write(json_object) 812 813 return file
The function stats_to_file takes a file name as input, retrieves statistics, serializes them
into a JSON object, and writes the JSON object to the specified file.
Parameters
- file: The `file` parameter is a string that represents the file path where the JSON data will be written.
Returns
the name of the file that was written to.
    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        Generate a markdown file and print the statistics contained in a JSON
        file in a formatted manner.

        :param output_file: path of the markdown output file; when not
            provided, a temporary "stats.md" file is used
        :type output_file: str
        :param json_file: path of the JSON stats file; when not provided, a
            temporary "stats.json" file is used
        :type json_file: str
        :return: None
        """

        # Resolve paths to their full form
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Default file locations inside the temporary directory
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create parent folders when missing
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Read the stats back (the YAML parser also accepts JSON)
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output buffers: title, index (table of contents) and body
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Probe whether the entry can be rendered as a table:
                        # first as a dict, then as a JSON string; the bare
                        # excepts are a deliberate best-effort fallback chain
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info)), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            # Tabular entry: own heading, index entry, table
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f"  - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            # Scalar entry: plain bullet point
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write stats in markdown file (title, then index, then body)
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Output stats in markdown on stdout (index omitted on purpose)
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None
The print_stats function generates a markdown file and prints the statistics contained in a
JSON file in a formatted manner.
Parameters
- output_file: The `output_file` parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no `output_file` is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within it.
- json_file: The `json_file` parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used.
Returns
The function `print_stats` does not return any value. It has a return type annotation of `None`.
917 def get_input(self) -> str: 918 """ 919 It returns the value of the input variable. 920 :return: The input is being returned. 921 """ 922 return self.input
It returns the value of the input variable.
Returns
The input is being returned.
924 def get_input_format(self, input_file: str = None) -> str: 925 """ 926 This function returns the format of the input variable, either from the provided input file or 927 by prompting for input. 928 929 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 930 represents the file path of the input file. If no `input_file` is provided when calling the 931 method, it will default to `None` 932 :type input_file: str 933 :return: The format of the input variable is being returned. 934 """ 935 936 if not input_file: 937 input_file = self.get_input() 938 input_format = get_file_format(input_file) 939 return input_format
This function returns the format of the input variable, either from the provided input file or by prompting for input.
Parameters
- input_file: The `input_file` parameter in the `get_input_format` method is a string that represents the file path of the input file. If no `input_file` is provided when calling the method, it will default to `None` and the object's input file is used.
Returns
The format of the input variable is being returned.
941 def get_input_compressed(self, input_file: str = None) -> str: 942 """ 943 The function `get_input_compressed` returns the format of the input variable after compressing 944 it. 945 946 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 947 that represents the file path of the input file. If no `input_file` is provided when calling the 948 method, it will default to `None` and the method will then call `self.get_input()` to 949 :type input_file: str 950 :return: The function `get_input_compressed` returns the compressed format of the input 951 variable. 952 """ 953 954 if not input_file: 955 input_file = self.get_input() 956 input_compressed = get_file_compressed(input_file) 957 return input_compressed
The function get_input_compressed returns the format of the input variable after compressing
it.
Parameters
- input_file: The `input_file` parameter in the `get_input_compressed` method is a string that represents the file path of the input file. If no `input_file` is provided when calling the method, it will default to `None` and the method will then call `self.get_input()`.
Returns
The function
get_input_compressedreturns the compressed format of the input variable.
959 def get_output(self) -> str: 960 """ 961 It returns the output of the neuron. 962 :return: The output of the neural network. 963 """ 964 965 return self.output
It returns the output file path.
Returns
The output attribute of the object.
967 def get_output_format(self, output_file: str = None) -> str: 968 """ 969 The function `get_output_format` returns the format of the input variable or the output file if 970 provided. 971 972 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 973 that represents the file path of the output file. If no `output_file` is provided when calling 974 the method, it will default to the output obtained from the `get_output` method of the class 975 instance. The 976 :type output_file: str 977 :return: The format of the input variable is being returned. 978 """ 979 980 if not output_file: 981 output_file = self.get_output() 982 output_format = get_file_format(output_file) 983 984 return output_format
The function get_output_format returns the format of the input variable or the output file if
provided.
Parameters
- output_file: The `output_file` parameter in the `get_output_format` method is a string that represents the file path of the output file. If no `output_file` is provided when calling the method, it will default to the output obtained from the `get_output` method of the class instance.
Returns
The format of the input variable is being returned.
986 def get_config(self) -> dict: 987 """ 988 It returns the config 989 :return: The config variable is being returned. 990 """ 991 return self.config
It returns the config
Returns
The config variable is being returned.
993 def get_param(self) -> dict: 994 """ 995 It returns the param 996 :return: The param variable is being returned. 997 """ 998 return self.param
It returns the param
Returns
The param variable is being returned.
1000 def get_connexion_db(self) -> str: 1001 """ 1002 It returns the connexion_db attribute of the object 1003 :return: The connexion_db is being returned. 1004 """ 1005 return self.connexion_db
It returns the connexion_db attribute of the object
Returns
The connexion_db is being returned.
1007 def get_prefix(self) -> str: 1008 """ 1009 It returns the prefix of the object. 1010 :return: The prefix is being returned. 1011 """ 1012 return self.prefix
It returns the prefix of the object.
Returns
The prefix is being returned.
1014 def get_table_variants(self, clause: str = "select") -> str: 1015 """ 1016 This function returns the table_variants attribute of the object 1017 1018 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 1019 defaults to select (optional) 1020 :return: The table_variants attribute of the object. 1021 """ 1022 1023 # Access 1024 access = self.get_config().get("access", None) 1025 1026 # Clauses "select", "where", "update" 1027 if clause in ["select", "where", "update"]: 1028 table_variants = self.table_variants 1029 # Clause "from" 1030 elif clause in ["from"]: 1031 # For Read Only 1032 if self.get_input_format() in ["parquet"] and access in ["RO"]: 1033 input_file = self.get_input() 1034 table_variants = f"'{input_file}' as variants" 1035 # For Read Write 1036 else: 1037 table_variants = f"{self.table_variants} as variants" 1038 else: 1039 table_variants = self.table_variants 1040 return table_variants
This function returns the table_variants attribute of the object
Parameters
- clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns
The table_variants attribute of the object.
1042 def get_tmp_dir(self) -> str: 1043 """ 1044 The function `get_tmp_dir` returns the temporary directory path based on configuration 1045 parameters or a default path. 1046 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1047 configuration, parameters, and a default value of "/tmp". 1048 """ 1049 1050 return get_tmp( 1051 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1052 )
The function get_tmp_dir returns the temporary directory path based on configuration
parameters or a default path.
Returns
The `get_tmp_dir` method returns the temporary directory path based on the configuration, parameters, and a default value of "/tmp".
1054 def get_connexion_type(self) -> str: 1055 """ 1056 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1057 1058 :return: The connexion type is being returned. 1059 """ 1060 return self.get_config().get("connexion_type", "memory")
It returns the configured connexion type, defaulting to "memory".
Returns
The connexion type is being returned.
1062 def get_connexion(self): 1063 """ 1064 It returns the connection object 1065 1066 :return: The connection object. 1067 """ 1068 return self.conn
It returns the connection object
Returns
The connection object.
1070 def close_connexion(self) -> None: 1071 """ 1072 This function closes the connection to the database. 1073 :return: The connection is being closed. 1074 """ 1075 return self.conn.close()
This function closes the connection to the database.
Returns
The connection is being closed.
1077 def get_header(self, type: str = "vcf"): 1078 """ 1079 This function returns the header of the VCF file as a list of strings 1080 1081 :param type: the type of header you want to get, defaults to vcf (optional) 1082 :return: The header of the vcf file. 1083 """ 1084 1085 if self.header_vcf: 1086 if type == "vcf": 1087 return self.header_vcf 1088 elif type == "list": 1089 return self.header_list 1090 else: 1091 if type == "vcf": 1092 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1093 return header 1094 elif type == "list": 1095 return vcf_required
This function returns the header of the VCF file as a list of strings
Parameters
- type: the type of header you want to get, defaults to vcf (optional)
Returns
The header of the vcf file.
1097 def get_header_infos_list(self) -> list: 1098 """ 1099 This function retrieves a list of information fields from the header. 1100 :return: A list of information fields from the header. 1101 """ 1102 1103 # Init 1104 infos_list = [] 1105 1106 for field in self.get_header().infos: 1107 infos_list.append(field) 1108 1109 return infos_list
This function retrieves a list of information fields from the header.
Returns
A list of information fields from the header.
1111 def get_header_length(self, file: str = None) -> int: 1112 """ 1113 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1114 line. 1115 1116 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1117 header file. If this argument is provided, the function will read the header from the specified 1118 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1119 :type file: str 1120 :return: the length of the header list, excluding the #CHROM line. 1121 """ 1122 1123 if file: 1124 return len(self.read_vcf_header_file(file=file)) - 1 1125 elif self.get_header(type="list"): 1126 return len(self.get_header(type="list")) - 1 1127 else: 1128 return 0
The function get_header_length returns the length of the header list, excluding the #CHROM
line.
Parameters
- file: The `file` parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line).
Returns
the length of the header list, excluding the #CHROM line.
1130 def get_header_columns(self) -> str: 1131 """ 1132 This function returns the header list of a VCF 1133 1134 :return: The length of the header list. 1135 """ 1136 if self.get_header(): 1137 return self.get_header(type="list")[-1] 1138 else: 1139 return ""
This function returns the columns line of a VCF header
Returns
The last header line (the "#CHROM ..." columns line), or an empty string.
1141 def get_header_columns_as_list(self) -> list: 1142 """ 1143 This function returns the header list of a VCF 1144 1145 :return: The length of the header list. 1146 """ 1147 if self.get_header(): 1148 return self.get_header_columns().strip().split("\t") 1149 else: 1150 return []
This function returns the VCF header columns as a list
Returns
The header columns as a list of column names.
1152 def get_header_columns_as_sql(self) -> str: 1153 """ 1154 This function retruns header length (without #CHROM line) 1155 1156 :return: The length of the header list. 1157 """ 1158 sql_column_list = [] 1159 for col in self.get_header_columns_as_list(): 1160 sql_column_list.append(f'"{col}"') 1161 return ",".join(sql_column_list)
This function returns the header columns as SQL identifiers
Returns
The header columns as a comma-separated list of double-quoted SQL identifiers.
1163 def get_header_sample_list( 1164 self, check: bool = False, samples: list = None, samples_force: bool = False 1165 ) -> list: 1166 """ 1167 The function `get_header_sample_list` returns a list of samples from a VCF header, with optional 1168 checking and filtering based on input parameters. 1169 1170 :param check: The `check` parameter in the `get_header_sample_list` function is a boolean 1171 parameter that determines whether to check if the samples in the list are properly defined as 1172 genotype columns. If `check` is set to `True`, the function will verify if each sample in the 1173 list is defined as a, defaults to False 1174 :type check: bool (optional) 1175 :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that 1176 allows you to specify a subset of samples from the header. If you provide a list of sample 1177 names, the function will check if each sample is defined in the header. If a sample is not found 1178 in the 1179 :type samples: list 1180 :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is 1181 a boolean parameter that determines whether to force the function to return the sample list 1182 without checking if the samples are genotype columns. If `samples_force` is set to `True`, the 1183 function will return the sample list without performing, defaults to False 1184 :type samples_force: bool (optional) 1185 :return: The function `get_header_sample_list` returns a list of samples based on the input 1186 parameters and conditions specified in the function. 
1187 """ 1188 1189 # Init 1190 samples_list = [] 1191 1192 if samples is None: 1193 samples_list = self.header_vcf.samples 1194 else: 1195 samples_checked = [] 1196 for sample in samples: 1197 if sample in self.header_vcf.samples: 1198 samples_checked.append(sample) 1199 else: 1200 log.warning(f"Sample '{sample}' not defined in header") 1201 samples_list = samples_checked 1202 1203 # Force sample list without checking if is_genotype_column 1204 if samples_force: 1205 log.warning(f"Samples {samples_list} not checked if genotypes") 1206 return samples_list 1207 1208 if check: 1209 samples_checked = [] 1210 for sample in samples_list: 1211 if self.is_genotype_column(column=sample): 1212 samples_checked.append(sample) 1213 else: 1214 log.warning( 1215 f"Sample '{sample}' not defined as a sample (genotype not well defined)" 1216 ) 1217 samples_list = samples_checked 1218 1219 # Return samples list 1220 return samples_list
The function get_header_sample_list returns a list of samples from a VCF header, with optional
checking and filtering based on input parameters.
Parameters
- check: The `check` parameter is a boolean that determines whether to verify that the samples in the list are properly defined as genotype columns; defaults to False.
- samples: The `samples` parameter is a list that allows you to specify a subset of samples from the header. If a sample is not found in the header, it is dropped with a warning.
- samples_force: The `samples_force` parameter is a boolean that, when set to True, makes the function return the sample list without checking whether the samples are genotype columns; defaults to False.
Returns
The function `get_header_sample_list` returns a list of samples based on the input parameters and conditions specified in the function.
1222 def is_genotype_column(self, column: str = None) -> bool: 1223 """ 1224 This function checks if a given column is a genotype column in a database. 1225 1226 :param column: The `column` parameter in the `is_genotype_column` method is a string that 1227 represents the column name in a database table. This method checks if the specified column is a 1228 genotype column in the database. If a column name is provided, it calls the `is_genotype_column` 1229 method of 1230 :type column: str 1231 :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter 1232 is not None, it calls the `is_genotype_column` method of the `Database` class with the specified 1233 column name and returns the result. If the `column` parameter is None, it returns False. 1234 """ 1235 1236 if column is not None: 1237 return Database(database=self.get_input()).is_genotype_column(column=column) 1238 else: 1239 return False
This function checks if a given column is a genotype column in a database.
Parameters
- column: The `column` parameter in the `is_genotype_column` method is a string that represents the column name in a database table. This method checks if the specified column is a genotype column in the database.
Returns
The `is_genotype_column` method returns a boolean value. If the `column` parameter is not None, it calls the `is_genotype_column` method of the `Database` class with the specified column name and returns the result. Otherwise, it returns False.
1241 def get_verbose(self) -> bool: 1242 """ 1243 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1244 exist 1245 1246 :return: The value of the key "verbose" in the config dictionary. 1247 """ 1248 return self.get_config().get("verbose", False)
It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist
Returns
The value of the key "verbose" in the config dictionary.
1250 def get_connexion_format(self) -> str: 1251 """ 1252 It returns the connexion format of the object. 1253 :return: The connexion_format is being returned. 1254 """ 1255 connexion_format = self.connexion_format 1256 if connexion_format not in ["duckdb", "sqlite"]: 1257 log.error(f"Unknown connexion format {connexion_format}") 1258 raise ValueError(f"Unknown connexion format {connexion_format}") 1259 else: 1260 return connexion_format
It returns the connexion format of the object.
Returns
The connexion_format is being returned.
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table of the current database connection.

        :param file: path (or file object) of the file to load
        :param columns: comma-separated list of target column names
        :type columns: str
        :param header_len: number of leading lines to skip (e.g. a VCF
            header) before reading the data, defaults to 0
        :type header_len: int (optional)
        :param sep: field separator of the input file, defaults to a tab
        :type sep: str (optional)
        :param chunksize: number of rows read per chunk; overridden by the
            "load.chunk" config entry when present, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config: the configured chunk size takes precedence over the argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): if chunksize resolves to a falsy value (0/None),
        # nothing is inserted at all — confirm this is intended
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # presumably DuckDB resolves the `chunk` DataFrame by name
                    # through its pandas replacement scan on the local scope —
                    # verify against the duckdb version in use
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # SQLite path: pandas appends the chunk directly
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
The function reads a file in chunks and inserts each chunk into a table based on the specified database format.
Parameters
- file: The `file` parameter is the file that you want to load into a table. It should be the path to the file on your system.
- columns: The `columns` parameter is a string containing the names of the table columns where the data will be inserted, separated by commas.
- header_len: The `header_len` parameter specifies the number of lines to skip at the beginning of the file before reading the actual data; defaults to 0.
- sep: The `sep` parameter specifies the separator character used in the file being read; the default is a tab character.
- chunksize: The `chunksize` parameter specifies the number of rows to read at a time when processing the file in chunks; defaults to 1000000.
1316 def load_data( 1317 self, 1318 input_file: str = None, 1319 drop_variants_table: bool = False, 1320 sample_size: int = 20480, 1321 ) -> None: 1322 """ 1323 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1324 table before loading the data and specify a sample size. 1325 1326 :param input_file: The path to the input file. This is the VCF file that will be loaded into the 1327 table 1328 :type input_file: str 1329 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1330 determines whether the variants table should be dropped before loading the data. If set to 1331 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1332 not be dropped, defaults to False 1333 :type drop_variants_table: bool (optional) 1334 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1335 the input file. If it is set to `None`, the default value of 20480 will be used, defaults to 1336 20480 1337 :type sample_size: int (optional) 1338 """ 1339 1340 log.info("Loading...") 1341 1342 # change input file 1343 if input_file: 1344 self.set_input(input_file) 1345 self.set_header() 1346 1347 # drop variants table 1348 if drop_variants_table: 1349 self.drop_variants_table() 1350 1351 # get table variants 1352 table_variants = self.get_table_variants() 1353 1354 # Access 1355 access = self.get_config().get("access", None) 1356 log.debug(f"access: {access}") 1357 1358 # Input format and compress 1359 input_format = self.get_input_format() 1360 input_compressed = self.get_input_compressed() 1361 log.debug(f"input_format: {input_format}") 1362 log.debug(f"input_compressed: {input_compressed}") 1363 1364 # input_compressed_format 1365 if input_compressed: 1366 input_compressed_format = "gzip" 1367 else: 1368 input_compressed_format = "none" 1369 log.debug(f"input_compressed_format: {input_compressed_format}") 1370 1371 # Connexion 
format 1372 connexion_format = self.get_connexion_format() 1373 1374 # Sample size 1375 if not sample_size: 1376 sample_size = -1 1377 log.debug(f"sample_size: {sample_size}") 1378 1379 # Load data 1380 log.debug(f"Load Data from {input_format}") 1381 1382 # DuckDB connexion 1383 if connexion_format in ["duckdb"]: 1384 1385 # Database already exists 1386 if self.input_format in ["db", "duckdb"]: 1387 1388 if connexion_format in ["duckdb"]: 1389 log.debug(f"Input file format '{self.input_format}' duckDB") 1390 else: 1391 log.error( 1392 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1393 ) 1394 raise ValueError( 1395 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1396 ) 1397 1398 # Load from existing database format 1399 else: 1400 1401 try: 1402 # Create Table or View 1403 database = Database(database=self.input) 1404 sql_from = database.get_sql_from(sample_size=sample_size) 1405 1406 if access in ["RO"]: 1407 sql_load = ( 1408 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1409 ) 1410 else: 1411 sql_load = ( 1412 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1413 ) 1414 self.conn.execute(sql_load) 1415 1416 except: 1417 # Format not available 1418 log.error(f"Input file format '{self.input_format}' not available") 1419 raise ValueError( 1420 f"Input file format '{self.input_format}' not available" 1421 ) 1422 1423 # SQLite connexion 1424 elif connexion_format in ["sqlite"] and input_format in [ 1425 "vcf", 1426 "tsv", 1427 "csv", 1428 "psv", 1429 ]: 1430 1431 # Main structure 1432 structure = { 1433 "#CHROM": "VARCHAR", 1434 "POS": "INTEGER", 1435 "ID": "VARCHAR", 1436 "REF": "VARCHAR", 1437 "ALT": "VARCHAR", 1438 "QUAL": "VARCHAR", 1439 "FILTER": "VARCHAR", 1440 "INFO": "VARCHAR", 1441 } 1442 1443 # Strcuture with samples 1444 structure_complete = structure 1445 if self.get_header_sample_list(): 1446 structure["FORMAT"] = "VARCHAR" 
1447 for sample in self.get_header_sample_list(): 1448 structure_complete[sample] = "VARCHAR" 1449 1450 # Columns list for create and insert 1451 sql_create_table_columns = [] 1452 sql_create_table_columns_list = [] 1453 for column in structure_complete: 1454 column_type = structure_complete[column] 1455 sql_create_table_columns.append( 1456 f'"{column}" {column_type} default NULL' 1457 ) 1458 sql_create_table_columns_list.append(f'"{column}"') 1459 1460 # Create database 1461 log.debug(f"Create Table {table_variants}") 1462 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1463 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1464 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1465 self.conn.execute(sql_create_table) 1466 1467 # chunksize define length of file chunk load file 1468 chunksize = 100000 1469 1470 # delimiter 1471 delimiter = file_format_delimiters.get(input_format, "\t") 1472 1473 # Load the input file 1474 with open(self.input, "rt") as input_file: 1475 1476 # Use the appropriate file handler based on the input format 1477 if input_compressed: 1478 input_file = bgzf.open(self.input, "rt") 1479 if input_format in ["vcf"]: 1480 header_len = self.get_header_length() 1481 else: 1482 header_len = 0 1483 1484 # Insert the file contents into a table 1485 self.insert_file_to_table( 1486 input_file, 1487 columns=sql_create_table_columns_list_sql, 1488 header_len=header_len, 1489 sep=delimiter, 1490 chunksize=chunksize, 1491 ) 1492 1493 else: 1494 log.error( 1495 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1496 ) 1497 raise ValueError( 1498 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1499 ) 1500 1501 # Explode INFOS fields into table fields 1502 if self.get_explode_infos(): 1503 self.explode_infos( 1504 prefix=self.get_explode_infos_prefix(), 1505 fields=self.get_explode_infos_fields(), 
1506 force=True, 1507 ) 1508 1509 # Create index after insertion 1510 self.create_indexes()
The load_data function reads a VCF file and inserts it into a table, with options to drop the
table before loading the data and specify a sample size.
Parameters
- input_file: The path to the input file. This is the VCF file that will be loaded into the table
- drop_variants_table: The `drop_variants_table` parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to `True`, the variants table will be dropped. If set to `False` (default), the variants table will not be dropped. Defaults to False.
- sample_size: The `sample_size` parameter determines the number of rows to be sampled from the input file. If it is set to `None`, the default value of 20480 will be used. Defaults to 20480.
1512 def get_explode_infos(self) -> bool: 1513 """ 1514 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1515 to False if it is not set. 1516 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1517 value. If the parameter is not present, it will return False. 1518 """ 1519 1520 return self.get_param().get("explode", {}).get("explode_infos", False)
The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting
to False if it is not set.
Returns
The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.
    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        The `get_explode_infos_fields` function resolves the list of INFO fields to
        explode from the input parameter `explode_infos_fields`.

        :param explode_infos_fields: A comma-separated string (or a list) of field
        names or regex patterns selecting the fields to explode. The keyword "*"
        selects every field declared in the header. When not provided, the value is
        taken from the "explode" section of the parameters; if still empty, it
        defaults to "*" (all header fields)
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: A boolean flag that determines whether to
        remove fields that are not present in the header. If set to `True`, any field
        that is not in the header will be excluded from the returned list, defaults
        to False
        :type remove_fields_not_in_header: bool (optional)
        :return: A deduplicated list of field names: for each requested name/pattern,
        the matching header fields (and, unless `remove_fields_not_in_header` is set,
        the requested field itself when it has no header match).
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If no fields, defined as all fields in header using keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list (accept a comma-separated string or a list)
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # format keyword * in regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all fields with pattern (field is treated as a regex)
                r = re.compile(field)
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # Remove fields input from search
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header (avoid not well formatted header)
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header (if asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []
The get_explode_infos_fields function returns a list of exploded information fields based on
the input parameter explode_infos_fields.
Parameters
- explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the fields to be exploded. It can be set to "*" to explode all fields, or it can be a comma-separated list of field names (or regex patterns) to explode.
- remove_fields_not_in_header: The `remove_fields_not_in_header` parameter is a boolean flag that determines whether to remove fields that are not present in the header. If it is set to `True`, any field that is not in the header will be excluded from the list of exploded information fields. Defaults to False.
Returns
The function `get_explode_infos_fields` returns a list of exploded information fields. If the `explode_infos_fields` parameter is not provided, all fields in the header are selected (keyword "*"). Each entry of the comma-separated input is stripped of surrounding spaces and matched against the header fields before being added to the result.
1622 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1623 """ 1624 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1625 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1626 not provided. 1627 1628 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1629 prefix to be used for exploding or expanding information 1630 :type explode_infos_prefix: str 1631 :return: the value of the variable `explode_infos_prefix`. 1632 """ 1633 1634 if not explode_infos_prefix: 1635 explode_infos_prefix = ( 1636 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1637 ) 1638 1639 return explode_infos_prefix
The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or
the value of self.get_param().get("explode_infos_prefix", None) if explode_infos_prefix is
not provided.
Parameters
- explode_infos_prefix: The `explode_infos_prefix` parameter is a string that specifies a prefix to be used for exploding or expanding information.
Returns
The value of the variable `explode_infos_prefix`.
    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        The `add_column` function adds a column to a SQLite or DuckDB table with a
        default value if it doesn't already exist.

        :param table_name: The name of the table to which you want to add a column
        :param column_name: The name of the column that you want to add to the table
        :param column_type: The data type of the column to add, as a SQL type string
        such as "INTEGER", "TEXT", "REAL", etc
        :param default_value: Optional default value for the newly added column. Note
        that it is interpolated verbatim into the ALTER TABLE statement
        :param drop: A boolean flag that determines whether to drop the column if it
        already exists in the table. If `drop` is `True`, the existing column is
        dropped and re-created; if `False` (default) and the column exists, nothing
        is done, defaults to False
        :type drop: bool (optional)
        :return: a dict describing the added column ("table_name", "column_name",
        "column_type", "default_value"), or None when no new column was added (it
        already existed, or it was dropped and re-created).
        """

        # added
        added = False
        dropped = False

        # Check if the column already exists in the table (case-insensitive)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name.upper() in [c.upper() for c in columns]:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # NOTE(review): after a drop+re-create, `added` stays False and the method
        # returns None — callers relying on the returned dict should be aware
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column
The add_column function adds a column to a SQLite or DuckDB table with a default value if it
doesn't already exist.
Parameters
- table_name: The name of the table to which you want to add a column
- column_name: The parameter "column_name" is the name of the column that you want to add to the table
- column_type: The `column_type` parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc.
- default_value: The `default_value` parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column.
- drop: The `drop` parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If `drop` is set to `True`, the function will drop the existing column before adding the new column. If `drop` is set to `False` (default), the existing column is left untouched. Defaults to False.
Returns
a dictionary describing the added column (table name, column name, type and default value), or None if no new column was added.
1713 def drop_column( 1714 self, column: dict = None, table_name: str = None, column_name: str = None 1715 ) -> bool: 1716 """ 1717 The `drop_column` function drops a specified column from a given table in a database and returns 1718 True if the column was successfully dropped, and False if the column does not exist in the 1719 table. 1720 1721 :param column: The `column` parameter is a dictionary that contains information about the column 1722 you want to drop. It has two keys: 1723 :type column: dict 1724 :param table_name: The `table_name` parameter is the name of the table from which you want to 1725 drop a column 1726 :type table_name: str 1727 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1728 from the table 1729 :type column_name: str 1730 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1731 and False if the column does not exist in the table. 1732 """ 1733 1734 # Find column infos 1735 if column: 1736 if isinstance(column, dict): 1737 table_name = column.get("table_name", None) 1738 column_name = column.get("column_name", None) 1739 elif isinstance(column, str): 1740 table_name = self.get_table_variants() 1741 column_name = column 1742 else: 1743 table_name = None 1744 column_name = None 1745 1746 if not table_name and not column_name: 1747 return False 1748 1749 # Removed 1750 removed = False 1751 1752 # Check if the column already exists in the table 1753 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1754 columns = self.get_query_to_df(query).columns.tolist() 1755 if column_name in columns: 1756 log.debug(f"The {column_name} column exists in the {table_name} table") 1757 else: 1758 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1759 return False 1760 1761 # Add column in table # ALTER TABLE integers DROP k 1762 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1763 
self.execute_query(add_column_query) 1764 removed = True 1765 log.debug( 1766 f"The {column_name} column was successfully dropped to the {table_name} table" 1767 ) 1768 1769 return removed
The drop_column function drops a specified column from a given table in a database and returns
True if the column was successfully dropped, and False if the column does not exist in the
table.
Parameters
- column: The `column` parameter is a dictionary that contains information about the column you want to drop. It has two keys: "table_name" and "column_name".
- table_name: The `table_name` parameter is the name of the table from which you want to drop a column.
- column_name: The `column_name` parameter is the name of the column that you want to drop from the table.
Returns
a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.
1771 def explode_infos( 1772 self, 1773 prefix: str = None, 1774 create_index: bool = False, 1775 fields: list = None, 1776 force: bool = False, 1777 proccess_all_fields_together: bool = False, 1778 table: str = None, 1779 ) -> list: 1780 """ 1781 The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into 1782 individual columns, returning a list of added columns. 1783 1784 :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO 1785 fields. If the `prefix` is not provided or is set to `None`, the function will use the value of 1786 `self.get_explode_infos_prefix()` as the prefix 1787 :type prefix: str 1788 :param create_index: The `create_index` parameter is a boolean flag that specifies whether to 1789 create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to 1790 `False`, indexes will not be created. The default value is `False`, defaults to False 1791 :type create_index: bool (optional) 1792 :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields 1793 that you want to explode into individual columns. If this parameter is not provided, all INFO 1794 fields will be exploded. You can specify the INFO fields you want to explode by passing them as 1795 a list to the ` 1796 :type fields: list 1797 :param force: The `force` parameter in the `explode_infos` function is a boolean flag that 1798 determines whether to drop and recreate a column if it already exists in the table. If `force` 1799 is set to `True`, the column will be dropped and recreated. If `force` is set to `False, 1800 defaults to False 1801 :type force: bool (optional) 1802 :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean 1803 flag that determines whether to process all the INFO fields together or individually. If set to 1804 `True`, all the INFO fields will be processed together. 
If set to `False`, each INFO field will 1805 be processed individually. The default value is, defaults to False 1806 :type proccess_all_fields_together: bool (optional) 1807 :param table: The `table` parameter in the `explode_infos` function is used to specify the name 1808 of the table where the exploded INFO fields will be added as individual columns. If you provide 1809 a value for the `table` parameter, the function will use that table name. If the `table` 1810 parameter is 1811 :type table: str 1812 :return: The `explode_infos` function returns a list of added columns. 1813 """ 1814 1815 # drop indexes 1816 self.drop_indexes() 1817 1818 # connexion format 1819 connexion_format = self.get_connexion_format() 1820 1821 # Access 1822 access = self.get_config().get("access", None) 1823 1824 # Added columns 1825 added_columns = [] 1826 1827 if access not in ["RO"]: 1828 1829 # prefix 1830 if prefix in [None, True] or not isinstance(prefix, str): 1831 if self.get_explode_infos_prefix() not in [None, True]: 1832 prefix = self.get_explode_infos_prefix() 1833 else: 1834 prefix = "INFO/" 1835 1836 # table variants 1837 if table is not None: 1838 table_variants = table 1839 else: 1840 table_variants = self.get_table_variants(clause="select") 1841 1842 # extra infos 1843 try: 1844 extra_infos = self.get_extra_infos() 1845 except: 1846 extra_infos = [] 1847 1848 # Header infos 1849 header_infos = self.get_header().infos 1850 1851 log.debug( 1852 f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields" 1853 ) 1854 1855 sql_info_alter_table_array = [] 1856 1857 # Info fields to check 1858 fields_list = list(header_infos) 1859 if fields: 1860 fields_list += fields 1861 fields_list = set(fields_list) 1862 1863 # If no fields 1864 if not fields: 1865 fields = [] 1866 1867 # Translate fields if patterns 1868 fields = self.get_explode_infos_fields(explode_infos_fields=fields) 1869 1870 for info in fields: 1871 1872 info_id_sql = prefix + info 1873 1874 if ( 1875 info 
in fields_list 1876 or prefix + info in fields_list 1877 or info in extra_infos 1878 ): 1879 1880 log.debug(f"Explode INFO fields - ADD '{info}' annotations fields") 1881 1882 if info in header_infos: 1883 info_type = header_infos[info].type 1884 info_num = header_infos[info].num 1885 else: 1886 info_type = "String" 1887 info_num = 0 1888 1889 type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR") 1890 if info_num != 1: 1891 type_sql = "VARCHAR" 1892 1893 # Add field 1894 added_column = self.add_column( 1895 table_name=table_variants, 1896 column_name=info_id_sql, 1897 column_type=type_sql, 1898 default_value="null", 1899 drop=force, 1900 ) 1901 1902 if added_column: 1903 added_columns.append(added_column) 1904 1905 if added_column or force: 1906 1907 # add field to index 1908 self.index_additionnal_fields.append(info_id_sql) 1909 1910 # Update field array 1911 if connexion_format in ["duckdb"]: 1912 update_info_field = f""" 1913 "{info_id_sql}" = 1914 CASE 1915 WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL 1916 ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) 1917 END 1918 """ 1919 elif connexion_format in ["sqlite"]: 1920 update_info_field = f""" 1921 "{info_id_sql}" = 1922 CASE 1923 WHEN instr(INFO, '{info}=') = 0 THEN NULL 1924 WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1) 1925 ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1) 1926 END 1927 """ 1928 1929 sql_info_alter_table_array.append(update_info_field) 1930 1931 if sql_info_alter_table_array: 1932 1933 # By chromosomes 1934 try: 1935 chromosomes_list = list( 1936 self.get_query_to_df( 1937 f""" 
SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """ 1938 )["#CHROM"] 1939 ) 1940 except: 1941 chromosomes_list = [None] 1942 1943 for chrom in chromosomes_list: 1944 log.debug(f"Explode INFO fields - Chromosome {chrom}...") 1945 1946 # Where clause 1947 where_clause = "" 1948 if chrom and len(chromosomes_list) > 1: 1949 where_clause = f""" WHERE "#CHROM" = '{chrom}' """ 1950 1951 # Update table 1952 if proccess_all_fields_together: 1953 sql_info_alter_table_array_join = ", ".join( 1954 sql_info_alter_table_array 1955 ) 1956 if sql_info_alter_table_array_join: 1957 sql_info_alter_table = f""" 1958 UPDATE {table_variants} 1959 SET {sql_info_alter_table_array_join} 1960 {where_clause} 1961 """ 1962 log.debug( 1963 f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..." 1964 ) 1965 # log.debug(sql_info_alter_table) 1966 self.conn.execute(sql_info_alter_table) 1967 else: 1968 sql_info_alter_num = 0 1969 for sql_info_alter in sql_info_alter_table_array: 1970 sql_info_alter_num += 1 1971 sql_info_alter_table = f""" 1972 UPDATE {table_variants} 1973 SET {sql_info_alter} 1974 {where_clause} 1975 """ 1976 log.debug( 1977 f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..." 1978 ) 1979 # log.debug(sql_info_alter_table) 1980 self.conn.execute(sql_info_alter_table) 1981 1982 # create indexes 1983 if create_index: 1984 self.create_indexes() 1985 1986 return added_columns
The explode_infos function in Python takes a VCF file and explodes the INFO fields into
individual columns, returning a list of added columns.
Parameters
- prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO fields. If the `prefix` is not provided or is set to `None`, the function will use the value of `self.get_explode_infos_prefix()` as the prefix.
- create_index: The `create_index` parameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to `False`, indexes will not be created. Defaults to False.
- fields: The `fields` parameter is a list of INFO fields that you want to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded.
- force: The `force` parameter is a boolean flag that determines whether to drop and recreate a column if it already exists in the table. If `force` is set to `True`, the column will be dropped and recreated. Defaults to False.
- proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean flag that determines whether to process all the INFO fields together or individually. If set to `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will be processed individually. Defaults to False.
- table: The `table` parameter is used to specify the name of the table where the exploded INFO fields will be added as individual columns. If not provided, the variants table is used.
Returns
The `explode_infos` function returns a list of added columns.
1988 def create_indexes(self) -> None: 1989 """ 1990 Create indexes on the table after insertion 1991 """ 1992 1993 # Access 1994 access = self.get_config().get("access", None) 1995 1996 # get table variants 1997 table_variants = self.get_table_variants("FROM") 1998 1999 if self.get_indexing() and access not in ["RO"]: 2000 # Create index 2001 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 2002 self.conn.execute(sql_create_table_index) 2003 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 2004 self.conn.execute(sql_create_table_index) 2005 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 2006 self.conn.execute(sql_create_table_index) 2007 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 2008 self.conn.execute(sql_create_table_index) 2009 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 2010 self.conn.execute(sql_create_table_index) 2011 for field in self.index_additionnal_fields: 2012 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 2013 self.conn.execute(sql_create_table_index)
Create indexes on the table after insertion
    def drop_indexes(self) -> None:
        """
        Drop every index currently defined on the variants table, unless the
        database access is read-only.
        """

        # Access mode ("RO" means the database must not be modified)
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        # Get database format
        connexion_format = self.get_connexion_format()

        if access not in ["RO"]:
            # NOTE(review): assumes connexion_format is "duckdb" or "sqlite"; any
            # other value would leave sql_list_indexes undefined (NameError)
            if connexion_format in ["duckdb"]:
                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
            elif connexion_format in ["sqlite"]:
                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"

            list_indexes = self.conn.execute(sql_list_indexes)
            index_names = [row[0] for row in list_indexes.fetchall()]
            for index in index_names:
                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
                self.conn.execute(sql_drop_table_index)
Drop all indexes currently defined on the variants table.
2041 def read_vcf_header(self, f) -> list: 2042 """ 2043 It reads the header of a VCF file and returns a list of the header lines 2044 2045 :param f: the file object 2046 :return: The header lines of the VCF file. 2047 """ 2048 2049 header_list = [] 2050 for line in f: 2051 header_list.append(line) 2052 if line.startswith("#CHROM"): 2053 break 2054 return header_list
It reads the header of a VCF file and returns a list of the header lines
Parameters
- f: the file object
Returns
The header lines of the VCF file.
2056 def read_vcf_header_file(self, file: str = None) -> list: 2057 """ 2058 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 2059 uncompressed files. 2060 2061 :param file: The `file` parameter is a string that represents the path to the VCF header file 2062 that you want to read. It is an optional parameter, so if you don't provide a value, it will 2063 default to `None` 2064 :type file: str 2065 :return: The function `read_vcf_header_file` returns a list. 2066 """ 2067 2068 if self.get_input_compressed(input_file=file): 2069 with bgzf.open(file, "rt") as f: 2070 return self.read_vcf_header(f=f) 2071 else: 2072 with open(file, "rt") as f: 2073 return self.read_vcf_header(f=f)
The read_vcf_header_file function reads the header of a VCF file, handling both compressed and
uncompressed files.
Parameters
- file: The `file` parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default to `None`.
Returns
The function `read_vcf_header_file` returns a list.
2075 def execute_query(self, query: str): 2076 """ 2077 It takes a query as an argument, executes it, and returns the results 2078 2079 :param query: The query to be executed 2080 :return: The result of the query is being returned. 2081 """ 2082 if query: 2083 return self.conn.execute(query) # .fetchall() 2084 else: 2085 return None
It takes a query as an argument, executes it, and returns the results
Parameters
- query: The query to be executed
Returns
The result of the query is being returned.
def export_output(
    self,
    output_file: str | None = None,
    output_header: str | None = None,
    export_header: bool = True,
    query: str | None = None,
    parquet_partitions: list | None = None,
    chunk_size: int | None = None,
    threads: int | None = None,
    sort: bool = False,
    index: bool = False,
    order_by: str | None = None,
) -> bool:
    """
    The `export_output` function exports data from a VCF file to a specified output file in
    various formats, including VCF, CSV, TSV, PSV, and Parquet.

    :param output_file: Name of the output file to generate; defaults to `self.get_output()`
    :type output_file: str
    :param output_header: Name of the file where the VCF header is exported; defaults to
        `<output_file>.hdr`
    :type output_header: str
    :param export_header: Whether to export the header to a separate file (ignored for VCF
        output, where the header is embedded), defaults to True
    :type export_header: bool (optional)
    :param query: Optional SQL query to filter/select the data to export
    :type query: str
    :param parquet_partitions: Columns used to partition Parquet output; a comma-separated
        string is also accepted
    :type parquet_partitions: list
    :param chunk_size: Number of records per batch for Parquet export
    :type chunk_size: int
    :param threads: Number of threads for the export; defaults to `self.get_threads()`
    :type threads: int
    :param sort: Whether to sort the output by genomic coordinates, defaults to False
    :type sort: bool (optional)
    :param index: Whether to create an index on the output file, defaults to False
    :type index: bool (optional)
    :param order_by: Column(s) used to order the output (VCF export only)
    :type order_by: str
    :return: True if the output file exists after export, otherwise None
    """

    # Log
    log.info("Exporting...")

    # Resolve full paths
    output_file = full_path(output_file)
    output_header = full_path(output_header)

    # Config and Param
    config = self.get_config()
    param = self.get_param()

    # Temporary files to remove at the end
    tmp_to_remove = []

    # If no output, get it
    if not output_file:
        output_file = self.get_output()

    # If no threads, use configured default
    if not threads:
        threads = self.get_threads()

    # Auto header name with extension
    if export_header or output_header:
        if not output_header:
            output_header = f"{output_file}.hdr"
        # Export header
        self.export_header(output_file=output_file)

    # Switch off export header if VCF output (header is embedded in the VCF itself)
    output_file_type = get_file_format(output_file)
    if output_file_type in ["vcf"]:
        export_header = False
        tmp_to_remove.append(output_header)

    # Chunk size
    if not chunk_size:
        chunk_size = config.get("chunk_size", None)

    # Parquet partition (accept comma-separated string)
    if not parquet_partitions:
        parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
    if parquet_partitions and isinstance(parquet_partitions, str):
        parquet_partitions = parquet_partitions.split(",")

    # Order by
    if not order_by:
        order_by = param.get("export", {}).get("order_by", "")

    # Header in output
    header_in_output = param.get("export", {}).get("include_header", False)

    # Database source and connexion format
    database_source = self.get_connexion()
    connexion_format = self.get_connexion_format()

    # Explode infos
    if self.get_explode_infos():
        self.explode_infos(
            prefix=self.get_explode_infos_prefix(),
            fields=self.get_explode_infos_fields(),
            force=False,
        )

    # SQLite connexions cannot be exported directly:
    # dump the variants table to a temporary Parquet file first
    if connexion_format in ["sqlite"]:

        # Export in Parquet
        random_tmp = "".join(
            random.choice(string.ascii_lowercase) for i in range(10)
        )
        database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
        tmp_to_remove.append(database_source)

        # Table Variants
        table_variants = self.get_table_variants()

        # Create export query
        sql_query_export_subquery = f"""
            SELECT * FROM {table_variants}
            """

        # Write source file
        fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))

    # Create database
    database = Database(
        database=database_source,
        table="variants",
        header_file=output_header,
        conn_config=self.get_connexion_config(),
    )

    # Existing columns header
    existing_columns_header = database.get_header_columns_from_database(query=query)

    # Sample list (only meaningful for VCF output)
    if output_file_type in ["vcf"]:
        get_samples = self.get_samples()
        get_samples_check = self.get_samples_check()
        samples_force = get_samples is not None
        sample_list = self.get_header_sample_list(
            check=get_samples_check,
            samples=get_samples,
            samples_force=samples_force,
        )
    else:
        sample_list = None

    # Export file
    database.export(
        output_database=output_file,
        output_header=output_header,
        existing_columns_header=existing_columns_header,
        parquet_partitions=parquet_partitions,
        chunk_size=chunk_size,
        threads=threads,
        sort=sort,
        index=index,
        header_in_output=header_in_output,
        order_by=order_by,
        query=query,
        export_header=export_header,
        sample_list=sample_list,
    )

    # Remove temporary files
    remove_if_exists(tmp_to_remove)

    # BUGFIX: original returned "(X or None) and (X or None)" — the same
    # expression twice; one evaluation is equivalent and clearer.
    return os.path.exists(output_file) or None
The export_output function exports data from a VCF file to a specified output file in various
formats, including VCF, CSV, TSV, PSV, and Parquet.
Parameters
- output_file: The
output_fileparameter is a string that specifies the name of the output file to be generated by the function. This is where the exported data will be saved - output_header: The
`output_header` parameter is a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as the `output_file` parameter, but with the extension ".hdr". - export_header: The
export_headerparameter is a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. Ifexport_headeris True, the header will be exported to a file. Ifexport_headeris False, the header will not be, defaults to True, if output format is not VCF - query: The
queryparameter is an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported - parquet_partitions: The
parquet_partitionsparameter is a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets - chunk_size: The
chunk_sizeparameter specifies the number of records in batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files. - threads: The
threadsparameter is an optional parameter that specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If not provided, the function will use the default number of threads - sort: The
sortparameter is a boolean flag that determines whether the output file should be sorted or not. Ifsortis set toTrue, the output file will be sorted based on the genomic coordinates of the variants. By default, the value ofsortisFalse, defaults to False - index: The
indexparameter is a boolean flag that determines whether an index should be created on the output file. Ifindexis True, an index will be created. Ifindexis False, no index will be created. The default value is False, defaults to False - order_by: The
order_byparameter is a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format
Returns
a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.
2288 def get_extra_infos(self, table: str = None) -> list: 2289 """ 2290 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2291 in the header. 2292 2293 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2294 name of the table from which you want to retrieve the extra columns that are not present in the 2295 header. If the `table` parameter is not provided when calling the function, it will default to 2296 using the variants 2297 :type table: str 2298 :return: A list of columns that are in the specified table but not in the header of the table. 2299 """ 2300 2301 header_columns = [] 2302 2303 if not table: 2304 table = self.get_table_variants(clause="from") 2305 header_columns = self.get_header_columns() 2306 2307 # Check all columns in the database 2308 query = f""" SELECT * FROM {table} LIMIT 1 """ 2309 log.debug(f"query {query}") 2310 table_columns = self.get_query_to_df(query).columns.tolist() 2311 extra_columns = [] 2312 2313 # Construct extra infos (not in header) 2314 for column in table_columns: 2315 if column not in header_columns: 2316 extra_columns.append(column) 2317 2318 return extra_columns
The get_extra_infos function returns a list of columns that are in a specified table but not
in the header.
Parameters
- table: The
tableparameter in theget_extra_infosfunction is used to specify the name of the table from which you want to retrieve the extra columns that are not present in the header. If thetableparameter is not provided when calling the function, it will default to using the variants
Returns
A list of columns that are in the specified table but not in the header of the table.
2320 def get_extra_infos_sql(self, table: str = None) -> str: 2321 """ 2322 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2323 by double quotes 2324 2325 :param table: The name of the table to get the extra infos from. If None, the default table is 2326 used 2327 :type table: str 2328 :return: A string of the extra infos 2329 """ 2330 2331 return ", ".join( 2332 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2333 )
It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes
Parameters
- table: The name of the table to get the extra infos from. If None, the default table is used
Returns
A string of the extra infos
def export_header(
    self,
    header_name: str = None,
    output_file: str = None,
    output_file_ext: str = ".hdr",
    clean_header: bool = True,
    remove_chrom_line: bool = False,
) -> str:
    """
    Extract the VCF header, adjust it, and write it to `<output_file><output_file_ext>`.

    :param header_name: Name of the header file to create; when neither this nor
        `output_file` is given, `self.get_output()` is used as output file.
        NOTE(review): beyond that check, this parameter is not otherwise used — confirm intent.
    :type header_name: str
    :param output_file: Base name of the output file the header belongs to
    :type output_file: str
    :param output_file_ext: Extension appended to `output_file` for the header
        file, defaults to ".hdr"
    :type output_file_ext: str (optional)
    :param clean_header: Whether to fix malformed FORMAT lines of Type=Flag by
        rewriting them to Type=String, defaults to True
    :type clean_header: bool (optional)
    :param remove_chrom_line: Whether to drop the #CHROM line before writing,
        defaults to False
    :type remove_chrom_line: bool (optional)
    :return: The name of the header file created, or None when no header is set
    """

    if not header_name and not output_file:
        output_file = self.get_output()

    if self.get_header():

        # Get header object
        header_obj = self.get_header()

        # Create database on the input to discover the real file columns
        db_for_header = Database(database=self.get_input())

        # Get real columns in the file
        db_header_columns = db_for_header.get_columns()

        with tempfile.TemporaryDirectory() as tmpdir:

            # Write header to a temporary file
            # BUGFIX: use context managers so the file handles are closed
            # even if vcf.Writer raises (original used bare open/close)
            header_file_tmp = os.path.join(tmpdir, "header")
            with open(header_file_tmp, "w") as f:
                vcf.Writer(f, header_obj)

            # Replace #CHROM line with real columns
            header_list = db_for_header.read_header_file(
                header_file=header_file_tmp
            )
            header_list[-1] = "\t".join(db_header_columns)

            # Remove CHROM line
            if remove_chrom_line:
                header_list.pop()

            # Clean header: fix malformed "Type=Flag" FORMAT lines
            if clean_header:
                header_list_clean = []
                for head in header_list:
                    head_clean = re.subn(
                        "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
                        r"##FORMAT=<ID=\1,Number=\2,Type=String",
                        head,
                        2,
                    )[0]
                    header_list_clean.append(head_clean)
                header_list = header_list_clean

            # Write the final header file next to the output file
            tmp_header_name = output_file + output_file_ext
            with open(tmp_header_name, "w") as f:
                for line in header_list:
                    f.write(line)

        return tmp_header_name
The export_header function takes a VCF file, extracts the header, modifies it according to
specified options, and writes it to a new file.
Parameters
- header_name: The
header_nameparameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file - output_file: The
output_fileparameter in theexport_headerfunction is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file - output_file_ext: The
output_file_extparameter in theexport_headerfunction is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to theoutput_filename to create the final, defaults to .hdr - clean_header: The
clean_headerparameter in theexport_headerfunction is a boolean flag that determines whether the header should be cleaned or not. Whenclean_headeris set toTrue, the function will clean the header by modifying certain lines based on a specific pattern. Ifclean_header, defaults to True - remove_chrom_line: The
`remove_chrom_line` parameter in the `export_header` function is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `False`, it will be kept. Defaults to False
Returns
The function
export_headerreturns the name of the temporary header file that is created.
2430 def export_variant_vcf( 2431 self, 2432 vcf_file, 2433 remove_info: bool = False, 2434 add_samples: bool = True, 2435 list_samples: list = [], 2436 where_clause: str = "", 2437 index: bool = False, 2438 threads: int | None = None, 2439 ) -> bool | None: 2440 """ 2441 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2442 remove INFO field, add samples, and control compression and indexing. 2443 2444 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2445 written to. It is the output file that will contain the filtered VCF data based on the specified 2446 parameters 2447 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2448 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2449 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2450 in, defaults to False 2451 :type remove_info: bool (optional) 2452 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2453 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2454 If set to False, the samples will be removed. The default value is True, defaults to True 2455 :type add_samples: bool (optional) 2456 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2457 in the output VCF file. By default, all samples will be included. If you provide a list of 2458 samples, only those samples will be included in the output file 2459 :type list_samples: list 2460 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2461 determines whether or not to create an index for the output VCF file. If `index` is set to 2462 `True`, the output VCF file will be indexed using tabix. 
If `index`, defaults to False 2463 :type index: bool (optional) 2464 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2465 number of threads to use for exporting the VCF file. It determines how many parallel threads 2466 will be used during the export process. More threads can potentially speed up the export process 2467 by utilizing multiple cores of the processor. If 2468 :type threads: int | None 2469 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2470 method with various parameters including the output file, query, threads, sort flag, and index 2471 flag. The `export_output` method is responsible for exporting the VCF data based on the 2472 specified parameters and configurations provided in the `export_variant_vcf` function. 2473 """ 2474 2475 # Config 2476 config = self.get_config() 2477 2478 # Extract VCF 2479 log.debug("Export VCF...") 2480 2481 # Table variants 2482 table_variants = self.get_table_variants() 2483 2484 # Threads 2485 if not threads: 2486 threads = self.get_threads() 2487 2488 # Info fields 2489 if remove_info: 2490 if not isinstance(remove_info, str): 2491 remove_info = "." 
2492 info_field = f"""'{remove_info}' as INFO""" 2493 else: 2494 info_field = "INFO" 2495 2496 # Samples fields 2497 if add_samples: 2498 if not list_samples: 2499 list_samples = self.get_header_sample_list() 2500 if list_samples: 2501 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2502 else: 2503 samples_fields = "" 2504 log.debug(f"samples_fields: {samples_fields}") 2505 else: 2506 samples_fields = "" 2507 2508 # Where clause 2509 if where_clause is None: 2510 where_clause = "" 2511 2512 # Variants 2513 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2514 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2515 log.debug(f"sql_query_select={sql_query_select}") 2516 2517 return self.export_output( 2518 output_file=vcf_file, 2519 output_header=None, 2520 export_header=True, 2521 query=sql_query_select, 2522 parquet_partitions=None, 2523 chunk_size=config.get("chunk_size", None), 2524 threads=threads, 2525 sort=True, 2526 index=index, 2527 order_by=None, 2528 )
The export_variant_vcf function exports a VCF file with specified samples, allowing options to
remove INFO field, add samples, and control compression and indexing.
Parameters
- vcf_file: The
vcf_fileparameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters - remove_info: The
remove_infoparameter in theexport_variant_vcffunction is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set toTrue, the INFO field will be removed. If set toFalse, the INFO field will be included in, defaults to False - add_samples: The
add_samplesparameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True - list_samples: The
list_samplesparameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file - index: The
`index` parameter in the `export_variant_vcf` function is a boolean flag that determines whether or not to create an index for the output VCF file. If `index` is set to `True`, the output VCF file will be indexed using tabix; if `False`, no index will be created. Defaults to False - threads: The
threadsparameter in theexport_variant_vcffunction specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor. If
Returns
The
export_variant_vcffunction returns the result of calling theexport_outputmethod with various parameters including the output file, query, threads, sort flag, and index flag. Theexport_outputmethod is responsible for exporting the VCF data based on the specified parameters and configurations provided in theexport_variant_vcffunction.
2530 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2531 """ 2532 It takes a list of commands and runs them in parallel using the number of threads specified 2533 2534 :param commands: A list of commands to run 2535 :param threads: The number of threads to use, defaults to 1 (optional) 2536 """ 2537 2538 run_parallel_commands(commands, threads)
It takes a list of commands and runs them in parallel using the number of threads specified
Parameters
- commands: A list of commands to run
- threads: The number of threads to use, defaults to 1 (optional)
2540 def get_threads(self, default: int = 1) -> int: 2541 """ 2542 This function returns the number of threads to use for a job, with a default value of 1 if not 2543 specified. 2544 2545 :param default: The `default` parameter in the `get_threads` method is used to specify the 2546 default number of threads to use if no specific value is provided. If no value is provided for 2547 the `threads` parameter in the configuration or input parameters, the `default` value will be 2548 used, defaults to 1 2549 :type default: int (optional) 2550 :return: the number of threads to use for the current job. 2551 """ 2552 2553 # Config 2554 config = self.get_config() 2555 2556 # Param 2557 param = self.get_param() 2558 2559 # Input threads 2560 input_thread = param.get("threads", config.get("threads", None)) 2561 2562 # Check threads 2563 if not input_thread: 2564 threads = default 2565 elif int(input_thread) <= 0: 2566 threads = os.cpu_count() 2567 else: 2568 threads = int(input_thread) 2569 return threads
This function returns the number of threads to use for a job, with a default value of 1 if not specified.
Parameters
- default: The
defaultparameter in theget_threadsmethod is used to specify the default number of threads to use if no specific value is provided. If no value is provided for thethreadsparameter in the configuration or input parameters, thedefaultvalue will be used, defaults to 1
Returns
the number of threads to use for the current job.
2571 def get_memory(self, default: str = None) -> str: 2572 """ 2573 This function retrieves the memory value from parameters or configuration with a default value 2574 if not found. 2575 2576 :param default: The `get_memory` function takes in a default value as a string parameter. This 2577 default value is used as a fallback in case the `memory` parameter is not provided in the 2578 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2579 the function 2580 :type default: str 2581 :return: The `get_memory` function returns a string value representing the memory parameter. If 2582 the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will 2583 return the default value provided as an argument to the function. 2584 """ 2585 2586 # Config 2587 config = self.get_config() 2588 2589 # Param 2590 param = self.get_param() 2591 2592 # Input threads 2593 input_memory = param.get("memory", config.get("memory", None)) 2594 2595 # Check threads 2596 if input_memory: 2597 memory = input_memory 2598 else: 2599 memory = default 2600 2601 return memory
This function retrieves the memory value from parameters or configuration with a default value if not found.
Parameters
- default: The
get_memoryfunction takes in a default value as a string parameter. This default value is used as a fallback in case thememoryparameter is not provided in theparamdictionary or theconfigdictionary. Ifmemoryis not found in either dictionary, the function
Returns
The
get_memoryfunction returns a string value representing the memory parameter. If theinput_memoryis provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.
2603 def update_from_vcf(self, vcf_file: str) -> None: 2604 """ 2605 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2606 2607 :param vcf_file: the path to the VCF file 2608 """ 2609 2610 connexion_format = self.get_connexion_format() 2611 2612 if connexion_format in ["duckdb"]: 2613 self.update_from_vcf_duckdb(vcf_file) 2614 elif connexion_format in ["sqlite"]: 2615 self.update_from_vcf_sqlite(vcf_file)
If the database is duckdb, then use the parquet method, otherwise use the sqlite method
Parameters
- vcf_file: the path to the VCF file
2617 def update_from_vcf_duckdb(self, vcf_file: str) -> None: 2618 """ 2619 It takes a VCF file and updates the INFO column of the variants table in the database with the 2620 INFO column of the VCF file 2621 2622 :param vcf_file: the path to the VCF file 2623 """ 2624 2625 # varaints table 2626 table_variants = self.get_table_variants() 2627 2628 # Loading VCF into temporaire table 2629 skip = self.get_header_length(file=vcf_file) 2630 vcf_df = pd.read_csv( 2631 vcf_file, 2632 sep="\t", 2633 engine="c", 2634 skiprows=skip, 2635 header=0, 2636 low_memory=False, 2637 ) 2638 sql_query_update = f""" 2639 UPDATE {table_variants} as table_variants 2640 SET INFO = concat( 2641 CASE 2642 WHEN INFO NOT IN ('', '.') 2643 THEN INFO 2644 ELSE '' 2645 END, 2646 ( 2647 SELECT 2648 concat( 2649 CASE 2650 WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.') 2651 THEN ';' 2652 ELSE '' 2653 END 2654 , 2655 CASE 2656 WHEN table_parquet.INFO NOT IN ('','.') 2657 THEN table_parquet.INFO 2658 ELSE '' 2659 END 2660 ) 2661 FROM vcf_df as table_parquet 2662 WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR) 2663 AND table_parquet.\"POS\" = table_variants.\"POS\" 2664 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 2665 AND table_parquet.\"REF\" = table_variants.\"REF\" 2666 AND table_parquet.INFO NOT IN ('','.') 2667 ) 2668 ) 2669 ; 2670 """ 2671 self.conn.execute(sql_query_update)
It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file
Parameters
- vcf_file: the path to the VCF file
2673 def update_from_vcf_sqlite(self, vcf_file: str) -> None: 2674 """ 2675 It creates a temporary table in the SQLite database, loads the VCF file into the temporary 2676 table, then updates the INFO column of the variants table with the INFO column of the temporary 2677 table 2678 2679 :param vcf_file: The path to the VCF file you want to update the database with 2680 """ 2681 2682 # Create a temporary table for the VCF 2683 table_vcf = "tmp_vcf" 2684 sql_create = ( 2685 f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0" 2686 ) 2687 self.conn.execute(sql_create) 2688 2689 # Loading VCF into temporaire table 2690 vcf_df = pd.read_csv( 2691 vcf_file, sep="\t", comment="#", header=None, low_memory=False 2692 ) 2693 vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] 2694 vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False) 2695 2696 # Update table 'variants' with VCF data 2697 # warning: CONCAT as || operator 2698 sql_query_update = f""" 2699 UPDATE variants as table_variants 2700 SET INFO = CASE 2701 WHEN INFO NOT IN ('', '.') 2702 THEN INFO 2703 ELSE '' 2704 END || 2705 ( 2706 SELECT 2707 CASE 2708 WHEN table_variants.INFO NOT IN ('','.') 2709 AND table_vcf.INFO NOT IN ('','.') 2710 THEN ';' 2711 ELSE '' 2712 END || 2713 CASE 2714 WHEN table_vcf.INFO NOT IN ('','.') 2715 THEN table_vcf.INFO 2716 ELSE '' 2717 END 2718 FROM {table_vcf} as table_vcf 2719 WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\" 2720 AND table_vcf.\"POS\" = table_variants.\"POS\" 2721 AND table_vcf.\"ALT\" = table_variants.\"ALT\" 2722 AND table_vcf.\"REF\" = table_variants.\"REF\" 2723 ) 2724 """ 2725 self.conn.execute(sql_query_update) 2726 2727 # Drop temporary table 2728 sql_drop = f"DROP TABLE {table_vcf}" 2729 self.conn.execute(sql_drop)
It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table
Parameters
- vcf_file: The path to the VCF file you want to update the database with
2731 def drop_variants_table(self) -> None: 2732 """ 2733 > This function drops the variants table 2734 """ 2735 2736 table_variants = self.get_table_variants() 2737 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2738 self.conn.execute(sql_table_variants)
This function drops the variants table
2740 def set_variant_id( 2741 self, variant_id_column: str = "variant_id", force: bool = None 2742 ) -> str: 2743 """ 2744 It adds a column to the variants table called `variant_id` and populates it with a hash of the 2745 `#CHROM`, `POS`, `REF`, and `ALT` columns 2746 2747 :param variant_id_column: The name of the column to be created in the variants table, defaults 2748 to variant_id 2749 :type variant_id_column: str (optional) 2750 :param force: If True, the variant_id column will be created even if it already exists 2751 :type force: bool 2752 :return: The name of the column that contains the variant_id 2753 """ 2754 2755 # Assembly 2756 assembly = self.get_param().get( 2757 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 2758 ) 2759 2760 # INFO/Tag prefix 2761 prefix = self.get_explode_infos_prefix() 2762 2763 # Explode INFO/SVTYPE 2764 added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"]) 2765 2766 # variants table 2767 table_variants = self.get_table_variants() 2768 2769 # variant_id column 2770 if not variant_id_column: 2771 variant_id_column = "variant_id" 2772 2773 # Creta variant_id column 2774 if "variant_id" not in self.get_extra_infos() or force: 2775 2776 # Create column 2777 self.add_column( 2778 table_name=table_variants, 2779 column_name=variant_id_column, 2780 column_type="UBIGINT", 2781 default_value="0", 2782 ) 2783 2784 # Update column 2785 self.conn.execute( 2786 f""" 2787 UPDATE {table_variants} 2788 SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"') 2789 """ 2790 ) 2791 2792 # Remove added columns 2793 for added_column in added_columns: 2794 self.drop_column(column=added_column) 2795 2796 # return variant_id column name 2797 return variant_id_column
It adds a column to the variants table called variant_id and populates it with a hash of the
#CHROM, POS, REF, and ALT columns
Parameters
- variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
- force: If True, the variant_id column will be created even if it already exists
Returns
The name of the column that contains the variant_id
2799 def get_variant_id_column( 2800 self, variant_id_column: str = "variant_id", force: bool = None 2801 ) -> str: 2802 """ 2803 This function returns the variant_id column name 2804 2805 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2806 defaults to variant_id 2807 :type variant_id_column: str (optional) 2808 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2809 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2810 if it is not already set, or if it is set 2811 :type force: bool 2812 :return: The variant_id column name. 2813 """ 2814 2815 return self.set_variant_id(variant_id_column=variant_id_column, force=force)
This function returns the variant_id column name
Parameters
- variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
- force: If True, will force the variant_id to be set to the value of variant_id_column. If False, will only set the variant_id if it is not already set. If None, will set the variant_id if it is not already set, or if it is set
Returns
The variant_id column name.
def scan_databases(
    self,
    database_formats: list = None,
    database_releases: list = None,
) -> dict:
    """
    Scan for available annotation databases matching the specified formats
    and releases.

    :param database_formats: List of database formats to be scanned
        (e.g. "parquet"). Defaults to ["parquet"].
    :type database_formats: list
    :param database_releases: List of database releases to be scanned.
        Defaults to ["current"], i.e. only databases in the "current"
        release are scanned.
    :type database_releases: list
    :return: A dictionary containing information about databases that match
        the specified formats and releases.
    """

    # Avoid mutable default arguments (lists shared across calls)
    if database_formats is None:
        database_formats = ["parquet"]
    if database_releases is None:
        database_releases = ["current"]

    # Config
    config = self.get_config()

    # Param
    param = self.get_param()

    # Param - Assembly
    assembly = param.get("assembly", config.get("assembly", None))
    if not assembly:
        assembly = DEFAULT_ASSEMBLY
        log.warning(f"Default assembly '{assembly}'")

    # Scan for available databases
    log.info(
        f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
    )
    databases_infos_dict = databases_infos(
        database_folder_releases=database_releases,
        database_formats=database_formats,
        assembly=assembly,
        config=config,
    )
    log.info(
        f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
    )

    return databases_infos_dict
The function scan_databases scans for available databases based on specified formats and
releases.
Parameters
- database_formats: The `database_formats` parameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet"
- database_releases: The `database_releases` parameter is a list that specifies the releases of the databases to be scanned. In the provided function, the default value for `database_releases` is set to `["current"]`, meaning that by default, the function will scan databases that are in the "current" release
Returns
The function `scan_databases` returns a dictionary containing information about databases that match the specified formats and releases.
def annotation(self) -> None:
    """
    Annotate the VCF with the annotations specified in the config file.

    Pipeline (as implemented below):
    1. Fold the quick "annotations" string and the per-tool shortcut
       parameters ("annotation_parquet", "annotation_snpsift", ...) into a
       single comma-separated list.
    2. Expand "ALL[:format=...][:release=...]" entries by scanning existing
       databases.
    3. Resolve each entry to a tool (snpeff / annovar / exomiser / splice /
       bcftools / snpsift / bigwig / parquet) and a database file, filling
       param["annotation"][<tool>]["annotations"].
    4. Run each tool-specific annotation method.
    5. Re-explode INFO fields into table columns if configured.
    """

    # Config
    config = self.get_config()

    # Param
    param = self.get_param()

    # Param - Assembly
    assembly = param.get("assembly", config.get("assembly", None))
    if not assembly:
        assembly = DEFAULT_ASSEMBLY
        log.warning(f"Default assembly '{assembly}'")

    # annotations databases folders (union of generic, parquet and bcftools
    # database folders)
    annotations_databases = set(
        config.get("folders", {})
        .get("databases", {})
        .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
        + config.get("folders", {})
        .get("databases", {})
        .get("parquet", ["~/howard/databases/parquet/current"])
        + config.get("folders", {})
        .get("databases", {})
        .get("bcftools", ["~/howard/databases/bcftools/current"])
    )

    # Get param annotations (quick comma-separated string form)
    if param.get("annotations", None) and isinstance(
        param.get("annotations", None), str
    ):
        log.debug(param.get("annotations", None))
        param_annotation_list = param.get("annotations").split(",")
    else:
        param_annotation_list = []

    # Each tools param: normalize per-tool shortcut parameters into
    # "<tool>:<entry>" items; lists are joined with "+" so they stay one
    # comma-separated entry
    if param.get("annotation_parquet", None) != None:
        log.debug(
            f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
        )
        if isinstance(param.get("annotation_parquet", None), list):
            param_annotation_list.append(",".join(param.get("annotation_parquet")))
        else:
            param_annotation_list.append(param.get("annotation_parquet"))
    if param.get("annotation_snpsift", None) != None:
        if isinstance(param.get("annotation_snpsift", None), list):
            param_annotation_list.append(
                "snpsift:"
                + "+".join(param.get("annotation_snpsift")).replace(",", "+")
            )
        else:
            param_annotation_list.append(
                "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
            )
    if param.get("annotation_snpeff", None) != None:
        param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
    if param.get("annotation_bcftools", None) != None:
        if isinstance(param.get("annotation_bcftools", None), list):
            param_annotation_list.append(
                "bcftools:"
                + "+".join(param.get("annotation_bcftools")).replace(",", "+")
            )
        else:
            param_annotation_list.append(
                "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
            )
    if param.get("annotation_annovar", None) != None:
        param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
    if param.get("annotation_exomiser", None) != None:
        param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
    if param.get("annotation_splice", None) != None:
        param_annotation_list.append("splice:" + param.get("annotation_splice"))

    # Merge param annotations list
    param["annotations"] = ",".join(param_annotation_list)

    # debug
    log.debug(f"param_annotations={param['annotations']}")

    if param.get("annotations"):

        # Log
        # log.info("Annotations - Check annotation parameters")

        if not "annotation" in param:
            param["annotation"] = {}

        # List of annotations parameters: map each entry to its requested
        # INFO fields ({"INFO": None} means "all fields")
        annotations_list_input = {}
        if isinstance(param.get("annotations", None), str):
            annotation_file_list = [
                value for value in param.get("annotations", "").split(",")
            ]
            for annotation_file in annotation_file_list:
                annotations_list_input[annotation_file.strip()] = {"INFO": None}
        else:
            annotations_list_input = param.get("annotations", {})

        log.info(f"Quick Annotations:")
        for annotation_key in list(annotations_list_input.keys()):
            log.info(f"   {annotation_key}")

        # List of annotations and associated fields
        annotations_list = {}

        for annotation_file in annotations_list_input:

            # Explode annotations if ALL: replace the "ALL" entry with every
            # database found by scanning
            if (
                annotation_file.upper() == "ALL"
                or annotation_file.upper().startswith("ALL:")
            ):

                # check ALL parameters (formats, releases), e.g.
                # "ALL:format=parquet+vcf:release=current"
                annotation_file_split = annotation_file.split(":")
                database_formats = "parquet"
                database_releases = "current"
                for annotation_file_option in annotation_file_split[1:]:
                    database_all_options_split = annotation_file_option.split("=")
                    if database_all_options_split[0] == "format":
                        database_formats = database_all_options_split[1].split("+")
                    if database_all_options_split[0] == "release":
                        database_releases = database_all_options_split[1].split("+")

                # Scan for available databases
                databases_infos_dict = self.scan_databases(
                    database_formats=database_formats,
                    database_releases=database_releases,
                )

                # Add found databases in annotation parameters
                for database_infos in databases_infos_dict.keys():
                    annotations_list[database_infos] = {"INFO": None}

            else:
                annotations_list[annotation_file] = annotations_list_input[
                    annotation_file
                ]

        # Check each databases
        if len(annotations_list):

            log.info(
                f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
            )

            for annotation_file in annotations_list:

                # Init
                annotations = annotations_list.get(annotation_file, None)

                # Annotation snpEff: everything after "snpeff:" is treated
                # as snpEff command-line options
                if annotation_file.startswith("snpeff"):

                    log.debug(f"Quick Annotation snpEff")

                    if "snpeff" not in param["annotation"]:
                        param["annotation"]["snpeff"] = {}

                    if "options" not in param["annotation"]["snpeff"]:
                        param["annotation"]["snpeff"]["options"] = ""

                    # snpEff options in annotations
                    param["annotation"]["snpeff"]["options"] = "".join(
                        annotation_file.split(":")[1:]
                    )

                # Annotation Annovar: each ":"-separated token is an Annovar
                # annotation name
                elif annotation_file.startswith("annovar"):

                    log.debug(f"Quick Annotation Annovar")

                    if "annovar" not in param["annotation"]:
                        param["annotation"]["annovar"] = {}

                    if "annotations" not in param["annotation"]["annovar"]:
                        param["annotation"]["annovar"]["annotations"] = {}

                    # Options
                    annotation_file_split = annotation_file.split(":")
                    for annotation_file_annotation in annotation_file_split[1:]:
                        if annotation_file_annotation:
                            param["annotation"]["annovar"]["annotations"][
                                annotation_file_annotation
                            ] = annotations

                # Annotation Exomiser
                elif annotation_file.startswith("exomiser"):

                    log.debug(f"Quick Annotation Exomiser")

                    param["annotation"]["exomiser"] = params_string_to_dict(
                        annotation_file
                    )

                # Annotation Splice
                elif annotation_file.startswith("splice"):

                    log.debug(f"Quick Annotation Splice")

                    param["annotation"]["splice"] = params_string_to_dict(
                        annotation_file
                    )

                # Annotation Parquet or BCFTOOLS (file-based databases)
                else:

                    # Tools detection: strip an optional explicit tool prefix
                    if annotation_file.startswith("bcftools:"):
                        annotation_tool_initial = "bcftools"
                        annotation_file = ":".join(annotation_file.split(":")[1:])
                    elif annotation_file.startswith("snpsift:"):
                        annotation_tool_initial = "snpsift"
                        annotation_file = ":".join(annotation_file.split(":")[1:])
                    elif annotation_file.startswith("bigwig:"):
                        annotation_tool_initial = "bigwig"
                        annotation_file = ":".join(annotation_file.split(":")[1:])
                    else:
                        annotation_tool_initial = None

                    # list of files ("+" and ":" both separate files)
                    annotation_file_list = annotation_file.replace("+", ":").split(
                        ":"
                    )

                    for annotation_file in annotation_file_list:

                        if annotation_file:

                            # Annotation tool initial
                            annotation_tool = annotation_tool_initial

                            # Find file: literal path, expanded path, then
                            # database folders (with assembly subfolder)
                            annotation_file_found = None

                            if os.path.exists(annotation_file):
                                annotation_file_found = annotation_file
                            elif os.path.exists(full_path(annotation_file)):
                                annotation_file_found = full_path(annotation_file)
                            else:
                                # Find within assembly folders
                                for annotations_database in annotations_databases:
                                    found_files = find_all(
                                        annotation_file,
                                        os.path.join(
                                            annotations_database, assembly
                                        ),
                                    )
                                    if len(found_files) > 0:
                                        annotation_file_found = found_files[0]
                                        break
                                if not annotation_file_found and not assembly:
                                    # Find within folders
                                    for (
                                        annotations_database
                                    ) in annotations_databases:
                                        found_files = find_all(
                                            annotation_file, annotations_database
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                log.debug(
                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                )

                            # Full path
                            annotation_file_found = full_path(annotation_file_found)

                            if annotation_file_found:

                                database = Database(database=annotation_file_found)
                                quick_annotation_format = database.get_format()
                                quick_annotation_is_compressed = (
                                    database.is_compressed()
                                )
                                quick_annotation_is_indexed = os.path.exists(
                                    f"{annotation_file_found}.tbi"
                                )
                                # NOTE: bcftools is never auto-preferred here
                                # (flag is hard-coded False)
                                bcftools_preference = False

                                # Check Annotation Tool: infer from database
                                # format when no explicit prefix was given
                                if not annotation_tool:
                                    if (
                                        bcftools_preference
                                        and quick_annotation_format
                                        in ["vcf", "bed"]
                                        and quick_annotation_is_compressed
                                        and quick_annotation_is_indexed
                                    ):
                                        annotation_tool = "bcftools"
                                    elif quick_annotation_format in [
                                        "vcf",
                                        "bed",
                                        "tsv",
                                        "tsv",
                                        "csv",
                                        "json",
                                        "tbl",
                                        "parquet",
                                        "duckdb",
                                    ]:
                                        annotation_tool = "parquet"
                                    elif quick_annotation_format in ["bw"]:
                                        annotation_tool = "bigwig"
                                    else:
                                        log.error(
                                            f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                        )
                                        raise ValueError(
                                            f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                        )

                                log.debug(
                                    f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                )

                                # Annotation Tool dispatch: register the file
                                # under the chosen tool's annotations
                                if annotation_tool:
                                    if annotation_tool not in param["annotation"]:
                                        param["annotation"][annotation_tool] = {}
                                    if (
                                        "annotations"
                                        not in param["annotation"][annotation_tool]
                                    ):
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ] = {}
                                    param["annotation"][annotation_tool][
                                        "annotations"
                                    ][annotation_file_found] = annotations

                            else:
                                log.warning(
                                    f"Quick Annotation File {annotation_file} does NOT exist"
                                )

    # Persist the (possibly updated) parameters before running the tools
    self.set_param(param)

    # Run each configured annotation tool in turn
    if param.get("annotation", None):
        log.info("Annotations")
        if param.get("annotation", {}).get("parquet", None):
            log.info("Annotations 'parquet'...")
            self.annotation_parquet()
        if param.get("annotation", {}).get("bcftools", None):
            log.info("Annotations 'bcftools'...")
            self.annotation_bcftools()
        if param.get("annotation", {}).get("snpsift", None):
            log.info("Annotations 'snpsift'...")
            self.annotation_snpsift()
        if param.get("annotation", {}).get("bigwig", None):
            log.info("Annotations 'bigwig'...")
            self.annotation_bigwig()
        if param.get("annotation", {}).get("annovar", None):
            log.info("Annotations 'annovar'...")
            self.annotation_annovar()
        if param.get("annotation", {}).get("snpeff", None):
            log.info("Annotations 'snpeff'...")
            self.annotation_snpeff()
        if param.get("annotation", {}).get("exomiser", None) is not None:
            log.info("Annotations 'exomiser'...")
            self.annotation_exomiser()
        if param.get("annotation", {}).get("splice", None) is not None:
            log.info("Annotations 'splice' ...")
            self.annotation_splice()

    # Explode INFOS fields into table fields
    if self.get_explode_infos():
        self.explode_infos(
            prefix=self.get_explode_infos_prefix(),
            fields=self.get_explode_infos_fields(),
            force=True,
        )
It annotates the VCF file with the annotations specified in the config file.
def annotation_bigwig(self, threads: int = None) -> None:
    """
    Annotate variants in a VCF file using BigWig databases.

    For each configured BigWig database, a VCF-like header file is loaded (or
    generated on the fly for HTTP URLs), the requested annotation fields are
    mapped to value indexes, and every variant position is looked up with
    pyBigWig; non-NaN values are written into the variant INFO fields and the
    variants table is updated from the resulting VCF.

    :param threads: Number of threads for parallel processing during the
        annotation process; currently unused (threading code is commented out)
    :type threads: int
    :return: True when annotation ran (returns early with None if the variants
        table is empty)
    """

    # DEBUG
    log.debug("Start annotation with bigwig databases")

    # # Threads
    # if not threads:
    #     threads = self.get_threads()
    # log.debug("Threads: " + str(threads))

    # Config
    config = self.get_config()
    log.debug("Config: " + str(config))

    # Config - BigWig databases folders (generic annotations + bigwig-specific)
    databases_folders = set(
        self.get_config()
        .get("folders", {})
        .get("databases", {})
        .get("annotations", ["."])
        + self.get_config()
        .get("folders", {})
        .get("databases", {})
        .get("bigwig", ["."])
    )
    log.debug("Databases annotations: " + str(databases_folders))

    # Param: mapping of database file -> requested annotation fields
    annotations = (
        self.get_param()
        .get("annotation", {})
        .get("bigwig", {})
        .get("annotations", None)
    )
    log.debug("Annotations: " + str(annotations))

    # Assembly
    assembly = self.get_param().get(
        "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
    )

    # Data
    table_variants = self.get_table_variants()

    # Check if not empty: nothing to annotate on an empty variants table
    log.debug("Check if not empty")
    sql_query_chromosomes = (
        f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
    )
    sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
    if not sql_query_chromosomes_df["count"][0]:
        log.info(f"VCF empty")
        return

    # VCF header
    vcf_reader = self.get_header()
    log.debug("Initial header: " + str(vcf_reader.infos))

    # Existing annotations (logged for debugging only)
    for vcf_annotation in self.get_header().infos:

        vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
        log.debug(
            f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
        )

    if annotations:

        with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

            # Export VCF file
            tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")

            # annotation_bigwig_config: one entry per database, holding the
            # open pyBigWig handle and the field/index mappings
            annotation_bigwig_config_list = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No explicit fields means "all INFO fields"
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()

                # If db_file is http ?
                if database.get_database().startswith("http"):

                    # Database is an HTTP URL
                    db_file_is_http = True

                    # DB file keep as URL
                    db_file = database.get_database()
                    log.warning(
                        f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
                    )

                    # Retrieve automatic annotation field name from the URL
                    # basename (without the ".bw" extension)
                    annotation_field = clean_annotation_field(
                        os.path.basename(db_file).replace(".bw", "")
                    )
                    log.debug(
                        f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
                    )

                    # Create automatic header file describing the single
                    # Float annotation field
                    db_hdr_file = os.path.join(tmp_dir, "header.hdr")
                    with open(db_hdr_file, "w") as f:
                        f.write("##fileformat=VCFv4.2\n")
                        f.write(
                            f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
                        )
                        f.write(f"#CHROM START END {annotation_field}\n")

                else:

                    # Database is NOT an HTTP URL
                    db_file_is_http = False

                # Check index - try to create if not exists
                if (
                    db_file is None
                    or db_hdr_file is None
                    or (not os.path.exists(db_file) and not db_file_is_http)
                    or not os.path.exists(db_hdr_file)
                    or not db_file_type in ["bw"]
                ):
                    # if False:
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation file type: {db_file_type}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
                    )
                else:

                    # Log
                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: "
                        + str(db_hdr_vcf_header_infos)
                    )

                    # For all fields in database: "ALL"/"INFO" expands to
                    # every field declared in the header
                    annotation_fields_full = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )
                        annotation_fields_full = True

                    # Init
                    cyvcf2_header_rename_dict = {}
                    cyvcf2_header_list = []
                    cyvcf2_header_indexes = {}

                    # process annotation fields
                    for annotation_field in annotation_fields:

                        # New annotation name (possible rename of the field)
                        annotation_field_new = annotation_fields[annotation_field]

                        # Check annotation field and index in header; the
                        # "- 3" skips the #CHROM/START/END columns so the
                        # index maps into pyBigWig's values() result
                        if (
                            annotation_field
                            in db_hdr_vcf.get_header_columns_as_list()
                        ):
                            annotation_field_index = (
                                db_hdr_vcf.get_header_columns_as_list().index(
                                    annotation_field
                                )
                                - 3
                            )
                            cyvcf2_header_indexes[annotation_field_new] = (
                                annotation_field_index
                            )
                        else:
                            msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
                            log.error(msg_err)
                            raise ValueError(msg_err)

                        # Append annotation field in cyvcf2 header list
                        cyvcf2_header_rename_dict[annotation_field_new] = (
                            db_hdr_vcf_header_infos[annotation_field].id
                        )
                        cyvcf2_header_list.append(
                            {
                                "ID": annotation_field_new,
                                "Number": db_hdr_vcf_header_infos[
                                    annotation_field
                                ].num,
                                "Type": db_hdr_vcf_header_infos[
                                    annotation_field
                                ].type,
                                "Description": db_hdr_vcf_header_infos[
                                    annotation_field
                                ].desc,
                            }
                        )

                        # Add header on VCF
                        vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
                            annotation_field_new,
                            db_hdr_vcf_header_infos[annotation_field].num,
                            db_hdr_vcf_header_infos[annotation_field].type,
                            db_hdr_vcf_header_infos[annotation_field].desc,
                            "HOWARD BigWig annotation",
                            "unknown",
                            self.code_type_map[
                                db_hdr_vcf_header_infos[annotation_field].type
                            ],
                        )

                    # Load bigwig database and verify its format
                    bw_db = pyBigWig.open(db_file)
                    if bw_db.isBigWig():
                        log.debug(f"Database '{db_file}' is in 'BigWig' format")
                    else:
                        msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
                        log.error(msg_err)
                        raise ValueError(msg_err)

                    annotation_bigwig_config_list.append(
                        {
                            "db_file": db_file,
                            "bw_db": bw_db,
                            "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
                            "cyvcf2_header_list": cyvcf2_header_list,
                            "cyvcf2_header_indexes": cyvcf2_header_indexes,
                        }
                    )

            # Annotate
            if annotation_bigwig_config_list:

                # Annotation config
                log.debug(
                    f"annotation_bigwig_config={annotation_bigwig_config_list}"
                )

                # Export VCF file (INFO stripped, no samples) to feed cyvcf2
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Load input tmp file
                input_vcf = cyvcf2.VCF(tmp_vcf_name)

                # Add header in input file
                for annotation_bigwig_config in annotation_bigwig_config_list:
                    for cyvcf2_header_field in annotation_bigwig_config.get(
                        "cyvcf2_header_list", []
                    ):
                        log.info(
                            f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
                        )
                        input_vcf.add_info_to_header(cyvcf2_header_field)

                # Create output VCF file
                output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
                output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)

                # Fetch variants
                log.info(f"Annotations 'bigwig' start...")
                for variant in input_vcf:

                    for annotation_bigwig_config in annotation_bigwig_config_list:

                        # DB and indexes
                        bw_db = annotation_bigwig_config.get("bw_db", None)
                        cyvcf2_header_indexes = annotation_bigwig_config.get(
                            "cyvcf2_header_indexes", None
                        )

                        # Retrieve value from chrom pos (POS is 1-based in
                        # VCF, pyBigWig intervals are 0-based half-open)
                        res = bw_db.values(
                            variant.CHROM, variant.POS - 1, variant.POS
                        )

                        # For each annotation fields (and indexes)
                        for cyvcf2_header_index in cyvcf2_header_indexes:

                            # If value is NOT NaN
                            if not np.isnan(
                                res[cyvcf2_header_indexes[cyvcf2_header_index]]
                            ):
                                variant.INFO[cyvcf2_header_index] = res[
                                    cyvcf2_header_indexes[cyvcf2_header_index]
                                ]

                    # Add record in output file
                    output_vcf.write_record(variant)

                # Log
                log.debug(f"Annotation done.")

                # Close and write file
                log.info(f"Annotations 'bigwig' write...")
                output_vcf.close()
                log.debug(f"Write done.")

                # Update variants
                log.info(f"Annotations 'bigwig' update...")
                self.update_from_vcf(output_vcf_file)
                log.debug(f"Update done.")

    return True
The function annotation_bigwig annotates variants in a VCF file using bigwig databases.
Parameters
- threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the number of threads to be used for parallel processing during the annotation process. If the `threads` parameter is not provided, the method will attempt to determine the optimal number of threads to use based on the system configuration
Returns
True
3602 def annotation_snpsift(self, threads: int = None) -> None: 3603 """ 3604 This function annotate with bcftools 3605 3606 :param threads: Number of threads to use 3607 :return: the value of the variable "return_value". 3608 """ 3609 3610 # DEBUG 3611 log.debug("Start annotation with bcftools databases") 3612 3613 # Threads 3614 if not threads: 3615 threads = self.get_threads() 3616 log.debug("Threads: " + str(threads)) 3617 3618 # Config 3619 config = self.get_config() 3620 log.debug("Config: " + str(config)) 3621 3622 # Config - snpSift 3623 snpsift_bin_command = get_bin_command( 3624 bin="SnpSift.jar", 3625 tool="snpsift", 3626 bin_type="jar", 3627 config=config, 3628 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3629 ) 3630 if not snpsift_bin_command: 3631 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3632 log.error(msg_err) 3633 raise ValueError(msg_err) 3634 3635 # Config - bcftools 3636 bcftools_bin_command = get_bin_command( 3637 bin="bcftools", 3638 tool="bcftools", 3639 bin_type="bin", 3640 config=config, 3641 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3642 ) 3643 if not bcftools_bin_command: 3644 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3645 log.error(msg_err) 3646 raise ValueError(msg_err) 3647 3648 # Config - BCFTools databases folders 3649 databases_folders = set( 3650 self.get_config() 3651 .get("folders", {}) 3652 .get("databases", {}) 3653 .get("annotations", ["."]) 3654 + self.get_config() 3655 .get("folders", {}) 3656 .get("databases", {}) 3657 .get("bcftools", ["."]) 3658 ) 3659 log.debug("Databases annotations: " + str(databases_folders)) 3660 3661 # Param 3662 annotations = ( 3663 self.get_param() 3664 .get("annotation", {}) 3665 .get("snpsift", {}) 3666 .get("annotations", None) 3667 ) 3668 log.debug("Annotations: " + str(annotations)) 3669 3670 # Assembly 3671 assembly = self.get_param().get( 3672 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3673 ) 3674 
3675 # Data 3676 table_variants = self.get_table_variants() 3677 3678 # Check if not empty 3679 log.debug("Check if not empty") 3680 sql_query_chromosomes = ( 3681 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3682 ) 3683 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3684 if not sql_query_chromosomes_df["count"][0]: 3685 log.info(f"VCF empty") 3686 return 3687 3688 # VCF header 3689 vcf_reader = self.get_header() 3690 log.debug("Initial header: " + str(vcf_reader.infos)) 3691 3692 # Existing annotations 3693 for vcf_annotation in self.get_header().infos: 3694 3695 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3696 log.debug( 3697 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3698 ) 3699 3700 if annotations: 3701 3702 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3703 3704 # Export VCF file 3705 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3706 3707 # Init 3708 commands = {} 3709 3710 for annotation in annotations: 3711 annotation_fields = annotations[annotation] 3712 3713 # Annotation Name 3714 annotation_name = os.path.basename(annotation) 3715 3716 if not annotation_fields: 3717 annotation_fields = {"INFO": None} 3718 3719 log.debug(f"Annotation '{annotation_name}'") 3720 log.debug( 3721 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3722 ) 3723 3724 # Create Database 3725 database = Database( 3726 database=annotation, 3727 databases_folders=databases_folders, 3728 assembly=assembly, 3729 ) 3730 3731 # Find files 3732 db_file = database.get_database() 3733 db_file = full_path(db_file) 3734 db_hdr_file = database.get_header_file() 3735 db_hdr_file = full_path(db_hdr_file) 3736 db_file_type = database.get_format() 3737 db_tbi_file = f"{db_file}.tbi" 3738 db_file_compressed = database.is_compressed() 3739 3740 # Check if compressed 3741 if not db_file_compressed: 3742 log.error( 3743 f"Annotation '{annotation}' - {db_file} NOT 
compressed file" 3744 ) 3745 raise ValueError( 3746 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3747 ) 3748 3749 # Check if indexed 3750 if not os.path.exists(db_tbi_file): 3751 log.error( 3752 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3753 ) 3754 raise ValueError( 3755 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3756 ) 3757 3758 # Check index - try to create if not exists 3759 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3760 log.error("Annotation failed: database not valid") 3761 log.error(f"Annotation annotation file: {db_file}") 3762 log.error(f"Annotation annotation header: {db_hdr_file}") 3763 log.error(f"Annotation annotation index: {db_tbi_file}") 3764 raise ValueError( 3765 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3766 ) 3767 else: 3768 3769 log.debug( 3770 f"Annotation '{annotation}' - file: " 3771 + str(db_file) 3772 + " and " 3773 + str(db_hdr_file) 3774 ) 3775 3776 # Load header as VCF object 3777 db_hdr_vcf = Variants(input=db_hdr_file) 3778 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3779 log.debug( 3780 "Annotation database header: " 3781 + str(db_hdr_vcf_header_infos) 3782 ) 3783 3784 # For all fields in database 3785 annotation_fields_full = False 3786 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3787 annotation_fields = { 3788 key: key for key in db_hdr_vcf_header_infos 3789 } 3790 log.debug( 3791 "Annotation database header - All annotations added: " 3792 + str(annotation_fields) 3793 ) 3794 annotation_fields_full = True 3795 3796 # # Create file for field rename 3797 # log.debug("Create file for field rename") 3798 # tmp_rename = NamedTemporaryFile( 3799 # prefix=self.get_prefix(), 3800 # dir=self.get_tmp_dir(), 3801 # suffix=".rename", 3802 # delete=False, 3803 # ) 3804 # tmp_rename_name = tmp_rename.name 
3805 # tmp_files.append(tmp_rename_name) 3806 3807 # Number of fields 3808 nb_annotation_field = 0 3809 annotation_list = [] 3810 annotation_infos_rename_list = [] 3811 3812 for annotation_field in annotation_fields: 3813 3814 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3815 annotation_fields_new_name = annotation_fields.get( 3816 annotation_field, annotation_field 3817 ) 3818 if not annotation_fields_new_name: 3819 annotation_fields_new_name = annotation_field 3820 3821 # Check if field is in DB and if field is not elready in input data 3822 if ( 3823 annotation_field in db_hdr_vcf.get_header().infos 3824 and annotation_fields_new_name 3825 not in self.get_header().infos 3826 ): 3827 3828 log.info( 3829 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3830 ) 3831 3832 # BCFTools annotate param to rename fields 3833 if annotation_field != annotation_fields_new_name: 3834 annotation_infos_rename_list.append( 3835 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3836 ) 3837 3838 # Add INFO field to header 3839 db_hdr_vcf_header_infos_number = ( 3840 db_hdr_vcf_header_infos[annotation_field].num or "." 
3841 ) 3842 db_hdr_vcf_header_infos_type = ( 3843 db_hdr_vcf_header_infos[annotation_field].type 3844 or "String" 3845 ) 3846 db_hdr_vcf_header_infos_description = ( 3847 db_hdr_vcf_header_infos[annotation_field].desc 3848 or f"{annotation_field} description" 3849 ) 3850 db_hdr_vcf_header_infos_source = ( 3851 db_hdr_vcf_header_infos[annotation_field].source 3852 or "unknown" 3853 ) 3854 db_hdr_vcf_header_infos_version = ( 3855 db_hdr_vcf_header_infos[annotation_field].version 3856 or "unknown" 3857 ) 3858 3859 vcf_reader.infos[annotation_fields_new_name] = ( 3860 vcf.parser._Info( 3861 annotation_fields_new_name, 3862 db_hdr_vcf_header_infos_number, 3863 db_hdr_vcf_header_infos_type, 3864 db_hdr_vcf_header_infos_description, 3865 db_hdr_vcf_header_infos_source, 3866 db_hdr_vcf_header_infos_version, 3867 self.code_type_map[ 3868 db_hdr_vcf_header_infos_type 3869 ], 3870 ) 3871 ) 3872 3873 annotation_list.append(annotation_field) 3874 3875 nb_annotation_field += 1 3876 3877 else: 3878 3879 if ( 3880 annotation_field 3881 not in db_hdr_vcf.get_header().infos 3882 ): 3883 log.warning( 3884 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3885 ) 3886 if ( 3887 annotation_fields_new_name 3888 in self.get_header().infos 3889 ): 3890 log.warning( 3891 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3892 ) 3893 3894 log.info( 3895 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3896 ) 3897 3898 annotation_infos = ",".join(annotation_list) 3899 3900 if annotation_infos != "": 3901 3902 # Annotated VCF (and error file) 3903 tmp_annotation_vcf_name = os.path.join( 3904 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3905 ) 3906 tmp_annotation_vcf_name_err = ( 3907 tmp_annotation_vcf_name + ".err" 3908 ) 3909 3910 # Add fields to annotate 3911 if not annotation_fields_full: 3912 annotation_infos_option = f"-info {annotation_infos}" 3913 else: 
3914 annotation_infos_option = "" 3915 3916 # Info fields rename 3917 if annotation_infos_rename_list: 3918 annotation_infos_rename = " -c " + ",".join( 3919 annotation_infos_rename_list 3920 ) 3921 else: 3922 annotation_infos_rename = "" 3923 3924 # Annotate command 3925 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3926 3927 # Add command 3928 commands[command_annotate] = tmp_annotation_vcf_name 3929 3930 if commands: 3931 3932 # Export VCF file 3933 self.export_variant_vcf( 3934 vcf_file=tmp_vcf_name, 3935 remove_info=True, 3936 add_samples=False, 3937 index=True, 3938 ) 3939 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3940 3941 # Num command 3942 nb_command = 0 3943 3944 # Annotate 3945 for command_annotate in commands: 3946 nb_command += 1 3947 log.info( 3948 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3949 ) 3950 log.debug(f"command_annotate={command_annotate}") 3951 run_parallel_commands([command_annotate], threads) 3952 3953 # Debug 3954 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3955 3956 # Update variants 3957 log.info( 3958 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3959 ) 3960 self.update_from_vcf(commands[command_annotate])
This function annotates variants with bcftools.
Parameters
- threads: number of threads to use
Returns
None
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants table with `bcftools annotate`, using the
        databases configured in param["annotation"]["bcftools"]["annotations"].

        For each configured database: the database file, its header file and its
        tabix index are checked; the requested INFO fields are registered in the
        in-memory VCF header; and one `bcftools annotate` command is built per
        chromosome, restricted to the merged variant regions through a temporary
        BED file. All per-chromosome annotated VCFs are then merged back with the
        exported input VCF via `bcftools merge`, command stderr files are scanned
        for warnings/errors, and the merged result is loaded into the variants
        table with `update_from_vcf`.

        :param threads: Number of threads to use (defaults to `self.get_threads()`)
        :return: None (returns early if the variants table is empty)
        :raises ValueError: if no bcftools binary is available, if a database is
            missing, not compressed or not tabix-indexed, or if any bcftools
            command emitted "[E::" error lines
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads: fall back to the instance-level thread count
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # DEBUG
        # Keep temporary files around when verbosity is "debug".
        # NOTE(review): delete_tmp is computed but not referenced later in this
        # method — confirm whether it is still needed.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command (fail fast if bcftools is unavailable)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        # Union of the generic "annotations" folders and the bcftools-specific ones.
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - mapping {database: {field: new_name_or_None}}
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param takes precedence over config, then default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF
        # Temporary bgzipped VCF that will hold the exported input variants
        # (actually written later by export_variant_vcf, only if commands exist).
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header (mutated below to register new INFO fields)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            # Accumulators shared across all databases/chromosomes
            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No explicit fields means "take everything" (see INFO/ALL below)
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database (resolves the file within databases_folders/assembly)
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bcftools annotate requires bgzip + tabix)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check index - try to create if not exists
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object (header file parsed as a VCF)
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # For all fields in database: expand ALL/INFO to every
                    # field declared in the database header
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check if field is in DB and if field is not elready in input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header, defaulting missing
                            # metadata (number/type/description/source/version)
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # annotation_list.append(annotation_field)
                            # bcftools -c syntax: "NEW:=INFO/OLD" renames a field
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (remove "#CHROM" and variants line)
                        # Keep only "##" meta lines so bcftools -h gets a pure header.
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chomosomes present in the variants table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED columns in the annotation file
                        # (BED databases need explicit CHROM,POS,POS columns first)
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detecte regions: +/- 1Mb windows around each
                            # variant, clamped at 0, then merged into intervals
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files (per-chromosome annotated output + stderr)
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command: annotate region-restricted, bgzipped
                            # output (-Oz1), then tabix-index the result
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export VCF file (actual input for every annotate command)
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # calculate threads for annotated commands (split the thread
                # budget across the parallel bcftools commands, minimum 1)
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge all per-chromosome annotated VCFs with the input VCF
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file
                    # NOTE(review): delete=True means this file is removed when
                    # the Python handle is closed/garbage-collected — the shell
                    # merge below writes to its name; confirm lifetime is safe.
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command (cleanup chained after the merge)
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Error messages: scan every stderr file; "[W::" lines are
                    # warnings, "[E::" lines are fatal errors
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # debug info
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # failed
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants table from the merged annotated VCF
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)
This function annotates variants with Exomiser.
Parameters
- threads: number of threads to use
Returns
None
4443 def annotation_exomiser(self, threads: int = None) -> None: 4444 """ 4445 This function annotate with Exomiser 4446 4447 This function uses args as parameters, in section "annotation" -> "exomiser", with sections: 4448 - "analysis" (dict/file): 4449 Full analysis dictionnary parameters (see Exomiser docs). 4450 Either a dict, or a file in JSON or YAML format. 4451 These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO) 4452 Default : None 4453 - "preset" (string): 4454 Analysis preset (available in config folder). 4455 Used if no full "analysis" is provided. 4456 Default: "exome" 4457 - "phenopacket" (dict/file): 4458 Samples and phenotipic features parameters (see Exomiser docs). 4459 Either a dict, or a file in JSON or YAML format. 4460 Default: None 4461 - "subject" (dict): 4462 Sample parameters (see Exomiser docs). 4463 Example: 4464 "subject": 4465 { 4466 "id": "ISDBM322017", 4467 "sex": "FEMALE" 4468 } 4469 Default: None 4470 - "sample" (string): 4471 Sample name to construct "subject" section: 4472 "subject": 4473 { 4474 "id": "<sample>", 4475 "sex": "UNKNOWN_SEX" 4476 } 4477 Default: None 4478 - "phenotypicFeatures" (dict) 4479 Phenotypic features to construct "subject" section. 4480 Example: 4481 "phenotypicFeatures": 4482 [ 4483 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 4484 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 4485 ] 4486 - "hpo" (list) 4487 List of HPO ids as phenotypic features. 4488 Example: 4489 "hpo": ['0001156', '0001363', '0011304', '0010055'] 4490 Default: [] 4491 - "outputOptions" (dict): 4492 Output options (see Exomiser docs). 
4493 Default: 4494 "output_options" = 4495 { 4496 "outputContributingVariantsOnly": False, 4497 "numGenes": 0, 4498 "outputFormats": ["TSV_VARIANT", "VCF"] 4499 } 4500 - "transcript_source" (string): 4501 Transcript source (either "refseq", "ucsc", "ensembl") 4502 Default: "refseq" 4503 - "exomiser_to_info" (boolean): 4504 Add exomiser TSV file columns as INFO fields in VCF. 4505 Default: False 4506 - "release" (string): 4507 Exomise database release. 4508 If not exists, database release will be downloaded (take a while). 4509 Default: None (provided by application.properties configuration file) 4510 - "exomiser_application_properties" (file): 4511 Exomiser configuration file (see Exomiser docs). 4512 Useful to automatically download databases (especially for specific genome databases). 4513 4514 Notes: 4515 - If no sample in parameters, first sample in VCF will be chosen 4516 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4517 4518 :param threads: The number of threads to use 4519 :return: None. 
4520 """ 4521 4522 # DEBUG 4523 log.debug("Start annotation with Exomiser databases") 4524 4525 # Threads 4526 if not threads: 4527 threads = self.get_threads() 4528 log.debug("Threads: " + str(threads)) 4529 4530 # Config 4531 config = self.get_config() 4532 log.debug("Config: " + str(config)) 4533 4534 # Config - Folders - Databases 4535 databases_folders = ( 4536 config.get("folders", {}) 4537 .get("databases", {}) 4538 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4539 ) 4540 databases_folders = full_path(databases_folders) 4541 if not os.path.exists(databases_folders): 4542 log.error(f"Databases annotations: {databases_folders} NOT found") 4543 log.debug("Databases annotations: " + str(databases_folders)) 4544 4545 # Config - Exomiser 4546 exomiser_bin_command = get_bin_command( 4547 bin="exomiser-cli*.jar", 4548 tool="exomiser", 4549 bin_type="jar", 4550 config=config, 4551 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4552 ) 4553 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4554 if not exomiser_bin_command: 4555 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4556 log.error(msg_err) 4557 raise ValueError(msg_err) 4558 4559 # Param 4560 param = self.get_param() 4561 log.debug("Param: " + str(param)) 4562 4563 # Param - Exomiser 4564 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4565 log.debug(f"Param Exomiser: {param_exomiser}") 4566 4567 # Param - Assembly 4568 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4569 log.debug("Assembly: " + str(assembly)) 4570 4571 # Data 4572 table_variants = self.get_table_variants() 4573 4574 # Check if not empty 4575 log.debug("Check if not empty") 4576 sql_query_chromosomes = ( 4577 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4578 ) 4579 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4580 log.info(f"VCF empty") 4581 return False 4582 4583 # VCF header 4584 
vcf_reader = self.get_header() 4585 log.debug("Initial header: " + str(vcf_reader.infos)) 4586 4587 # Samples 4588 samples = self.get_header_sample_list() 4589 if not samples: 4590 log.error("No Samples in VCF") 4591 return False 4592 log.debug(f"Samples: {samples}") 4593 4594 # Memory limit 4595 memory_limit = self.get_memory("8G") 4596 log.debug(f"memory_limit: {memory_limit}") 4597 4598 # Exomiser java options 4599 exomiser_java_options = ( 4600 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4601 ) 4602 log.debug(f"Exomiser java options: {exomiser_java_options}") 4603 4604 # Download Exomiser (if not exists) 4605 exomiser_release = param_exomiser.get("release", None) 4606 exomiser_application_properties = param_exomiser.get( 4607 "exomiser_application_properties", None 4608 ) 4609 databases_download_exomiser( 4610 assemblies=[assembly], 4611 exomiser_folder=databases_folders, 4612 exomiser_release=exomiser_release, 4613 exomiser_phenotype_release=exomiser_release, 4614 exomiser_application_properties=exomiser_application_properties, 4615 ) 4616 4617 # Force annotation 4618 force_update_annotation = True 4619 4620 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4621 log.debug("Start annotation Exomiser") 4622 4623 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4624 4625 # tmp_dir = "/tmp/exomiser" 4626 4627 ### ANALYSIS ### 4628 ################ 4629 4630 # Create analysis.json through analysis dict 4631 # either analysis in param or by default 4632 # depending on preset exome/genome) 4633 4634 # Init analysis dict 4635 param_exomiser_analysis_dict = {} 4636 4637 # analysis from param 4638 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4639 param_exomiser_analysis = full_path(param_exomiser_analysis) 4640 4641 # If analysis in param -> load anlaysis json 4642 if param_exomiser_analysis: 4643 4644 # If param analysis is a file and exists 4645 if isinstance(param_exomiser_analysis, str) 
and os.path.exists( 4646 param_exomiser_analysis 4647 ): 4648 # Load analysis file into analysis dict (either yaml or json) 4649 with open(param_exomiser_analysis) as json_file: 4650 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4651 4652 # If param analysis is a dict 4653 elif isinstance(param_exomiser_analysis, dict): 4654 # Load analysis dict into analysis dict (either yaml or json) 4655 param_exomiser_analysis_dict = param_exomiser_analysis 4656 4657 # Error analysis type 4658 else: 4659 log.error(f"Analysis type unknown. Check param file.") 4660 raise ValueError(f"Analysis type unknown. Check param file.") 4661 4662 # Case no input analysis config file/dict 4663 # Use preset (exome/genome) to open default config file 4664 if not param_exomiser_analysis_dict: 4665 4666 # default preset 4667 default_preset = "exome" 4668 4669 # Get param preset or default preset 4670 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4671 4672 # Try to find if preset is a file 4673 if os.path.exists(param_exomiser_preset): 4674 # Preset file is provided in full path 4675 param_exomiser_analysis_default_config_file = ( 4676 param_exomiser_preset 4677 ) 4678 # elif os.path.exists(full_path(param_exomiser_preset)): 4679 # # Preset file is provided in full path 4680 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4681 elif os.path.exists( 4682 os.path.join(folder_config, param_exomiser_preset) 4683 ): 4684 # Preset file is provided a basename in config folder (can be a path with subfolders) 4685 param_exomiser_analysis_default_config_file = os.path.join( 4686 folder_config, param_exomiser_preset 4687 ) 4688 else: 4689 # Construct preset file 4690 param_exomiser_analysis_default_config_file = os.path.join( 4691 folder_config, 4692 f"preset-{param_exomiser_preset}-analysis.json", 4693 ) 4694 4695 # If preset file exists 4696 param_exomiser_analysis_default_config_file = full_path( 4697 
param_exomiser_analysis_default_config_file 4698 ) 4699 if os.path.exists(param_exomiser_analysis_default_config_file): 4700 # Load prest file into analysis dict (either yaml or json) 4701 with open( 4702 param_exomiser_analysis_default_config_file 4703 ) as json_file: 4704 # param_exomiser_analysis_dict[""] = json.load(json_file) 4705 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4706 json_file 4707 ) 4708 4709 # Error preset file 4710 else: 4711 log.error( 4712 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4713 ) 4714 raise ValueError( 4715 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4716 ) 4717 4718 # If no analysis dict created 4719 if not param_exomiser_analysis_dict: 4720 log.error(f"No analysis config") 4721 raise ValueError(f"No analysis config") 4722 4723 # Log 4724 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4725 4726 ### PHENOPACKET ### 4727 ################### 4728 4729 # If no PhenoPacket in analysis dict -> check in param 4730 if "phenopacket" not in param_exomiser_analysis_dict: 4731 4732 # If PhenoPacket in param -> load anlaysis json 4733 if param_exomiser.get("phenopacket", None): 4734 4735 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4736 param_exomiser_phenopacket = full_path( 4737 param_exomiser_phenopacket 4738 ) 4739 4740 # If param phenopacket is a file and exists 4741 if isinstance( 4742 param_exomiser_phenopacket, str 4743 ) and os.path.exists(param_exomiser_phenopacket): 4744 # Load phenopacket file into analysis dict (either yaml or json) 4745 with open(param_exomiser_phenopacket) as json_file: 4746 param_exomiser_analysis_dict["phenopacket"] = ( 4747 yaml.safe_load(json_file) 4748 ) 4749 4750 # If param phenopacket is a dict 4751 elif isinstance(param_exomiser_phenopacket, dict): 4752 # Load phenopacket dict into analysis dict (either yaml or json) 4753 param_exomiser_analysis_dict["phenopacket"] = ( 4754 
param_exomiser_phenopacket 4755 ) 4756 4757 # Error phenopacket type 4758 else: 4759 log.error(f"Phenopacket type unknown. Check param file.") 4760 raise ValueError( 4761 f"Phenopacket type unknown. Check param file." 4762 ) 4763 4764 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4765 if "phenopacket" not in param_exomiser_analysis_dict: 4766 4767 # Init PhenoPacket 4768 param_exomiser_analysis_dict["phenopacket"] = { 4769 "id": "analysis", 4770 "proband": {}, 4771 } 4772 4773 ### Add subject ### 4774 4775 # If subject exists 4776 param_exomiser_subject = param_exomiser.get("subject", {}) 4777 4778 # If subject not exists -> found sample ID 4779 if not param_exomiser_subject: 4780 4781 # Found sample ID in param 4782 sample = param_exomiser.get("sample", None) 4783 4784 # Find sample ID (first sample) 4785 if not sample: 4786 sample_list = self.get_header_sample_list() 4787 if len(sample_list) > 0: 4788 sample = sample_list[0] 4789 else: 4790 log.error(f"No sample found") 4791 raise ValueError(f"No sample found") 4792 4793 # Create subject 4794 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4795 4796 # Add to dict 4797 param_exomiser_analysis_dict["phenopacket"][ 4798 "subject" 4799 ] = param_exomiser_subject 4800 4801 ### Add "phenotypicFeatures" ### 4802 4803 # If phenotypicFeatures exists 4804 param_exomiser_phenotypicfeatures = param_exomiser.get( 4805 "phenotypicFeatures", [] 4806 ) 4807 4808 # If phenotypicFeatures not exists -> Try to infer from hpo list 4809 if not param_exomiser_phenotypicfeatures: 4810 4811 # Found HPO in param 4812 param_exomiser_hpo = param_exomiser.get("hpo", []) 4813 4814 # Split HPO if list in string format separated by comma 4815 if isinstance(param_exomiser_hpo, str): 4816 param_exomiser_hpo = param_exomiser_hpo.split(",") 4817 4818 # Create HPO list 4819 for hpo in param_exomiser_hpo: 4820 hpo_clean = re.sub("[^0-9]", "", hpo) 4821 param_exomiser_phenotypicfeatures.append( 4822 { 
4823 "type": { 4824 "id": f"HP:{hpo_clean}", 4825 "label": f"HP:{hpo_clean}", 4826 } 4827 } 4828 ) 4829 4830 # Add to dict 4831 param_exomiser_analysis_dict["phenopacket"][ 4832 "phenotypicFeatures" 4833 ] = param_exomiser_phenotypicfeatures 4834 4835 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4836 if not param_exomiser_phenotypicfeatures: 4837 for step in param_exomiser_analysis_dict.get( 4838 "analysis", {} 4839 ).get("steps", []): 4840 if "hiPhivePrioritiser" in step: 4841 param_exomiser_analysis_dict.get("analysis", {}).get( 4842 "steps", [] 4843 ).remove(step) 4844 4845 ### Add Input File ### 4846 4847 # Initial file name and htsFiles 4848 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4849 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4850 { 4851 "uri": tmp_vcf_name, 4852 "htsFormat": "VCF", 4853 "genomeAssembly": assembly, 4854 } 4855 ] 4856 4857 ### Add metaData ### 4858 4859 # If metaData not in analysis dict 4860 if "metaData" not in param_exomiser_analysis_dict: 4861 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4862 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4863 "createdBy": "howard", 4864 "phenopacketSchemaVersion": 1, 4865 } 4866 4867 ### OutputOptions ### 4868 4869 # Init output result folder 4870 output_results = os.path.join(tmp_dir, "results") 4871 4872 # If no outputOptions in analysis dict 4873 if "outputOptions" not in param_exomiser_analysis_dict: 4874 4875 # default output formats 4876 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4877 4878 # Get outputOptions in param 4879 output_options = param_exomiser.get("outputOptions", None) 4880 4881 # If no output_options in param -> check 4882 if not output_options: 4883 output_options = { 4884 "outputContributingVariantsOnly": False, 4885 "numGenes": 0, 4886 "outputFormats": defaut_output_formats, 4887 } 4888 4889 # Replace outputDirectory in output options 4890 output_options["outputDirectory"] = output_results 
4891 output_options["outputFileName"] = "howard" 4892 4893 # Add outputOptions in analysis dict 4894 param_exomiser_analysis_dict["outputOptions"] = output_options 4895 4896 else: 4897 4898 # Replace output_results and output format (if exists in param) 4899 param_exomiser_analysis_dict["outputOptions"][ 4900 "outputDirectory" 4901 ] = output_results 4902 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4903 list( 4904 set( 4905 param_exomiser_analysis_dict.get( 4906 "outputOptions", {} 4907 ).get("outputFormats", []) 4908 + ["TSV_VARIANT", "VCF"] 4909 ) 4910 ) 4911 ) 4912 4913 # log 4914 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4915 4916 ### ANALYSIS FILE ### 4917 ##################### 4918 4919 ### Full JSON analysis config file ### 4920 4921 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4922 with open(exomiser_analysis, "w") as fp: 4923 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4924 4925 ### SPLIT analysis and sample config files 4926 4927 # Splitted analysis dict 4928 param_exomiser_analysis_dict_for_split = ( 4929 param_exomiser_analysis_dict.copy() 4930 ) 4931 4932 # Phenopacket JSON file 4933 exomiser_analysis_phenopacket = os.path.join( 4934 tmp_dir, "analysis_phenopacket.json" 4935 ) 4936 with open(exomiser_analysis_phenopacket, "w") as fp: 4937 json.dump( 4938 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4939 fp, 4940 indent=4, 4941 ) 4942 4943 # Analysis JSON file without Phenopacket parameters 4944 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4945 exomiser_analysis_analysis = os.path.join( 4946 tmp_dir, "analysis_analysis.json" 4947 ) 4948 with open(exomiser_analysis_analysis, "w") as fp: 4949 json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4950 4951 ### INITAL VCF file ### 4952 ####################### 4953 4954 ### Create list of samples to use and include inti initial VCF file #### 4955 4956 # Subject (main sample) 4957 # Get sample ID in 
analysis dict 4958 sample_subject = ( 4959 param_exomiser_analysis_dict.get("phenopacket", {}) 4960 .get("subject", {}) 4961 .get("id", None) 4962 ) 4963 sample_proband = ( 4964 param_exomiser_analysis_dict.get("phenopacket", {}) 4965 .get("proband", {}) 4966 .get("subject", {}) 4967 .get("id", None) 4968 ) 4969 sample = [] 4970 if sample_subject: 4971 sample.append(sample_subject) 4972 if sample_proband: 4973 sample.append(sample_proband) 4974 4975 # Get sample ID within Pedigree 4976 pedigree_persons_list = ( 4977 param_exomiser_analysis_dict.get("phenopacket", {}) 4978 .get("pedigree", {}) 4979 .get("persons", {}) 4980 ) 4981 4982 # Create list with all sample ID in pedigree (if exists) 4983 pedigree_persons = [] 4984 for person in pedigree_persons_list: 4985 pedigree_persons.append(person.get("individualId")) 4986 4987 # Concat subject sample ID and samples ID in pedigreesamples 4988 samples = list(set(sample + pedigree_persons)) 4989 4990 # Check if sample list is not empty 4991 if not samples: 4992 log.error(f"No samples found") 4993 raise ValueError(f"No samples found") 4994 4995 # Create VCF with sample (either sample in param or first one by default) 4996 # Export VCF file 4997 self.export_variant_vcf( 4998 vcf_file=tmp_vcf_name, 4999 remove_info=True, 5000 add_samples=True, 5001 list_samples=samples, 5002 index=False, 5003 ) 5004 5005 ### Execute Exomiser ### 5006 ######################## 5007 5008 # Init command 5009 exomiser_command = "" 5010 5011 # Command exomiser options 5012 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 5013 5014 # Release 5015 exomiser_release = param_exomiser.get("release", None) 5016 if exomiser_release: 5017 # phenotype data version 5018 exomiser_options += ( 5019 f" --exomiser.phenotype.data-version={exomiser_release} " 5020 ) 5021 # data version 5022 exomiser_options += ( 5023 f" 
--exomiser.{assembly}.data-version={exomiser_release} " 5024 ) 5025 # variant white list 5026 variant_white_list_file = ( 5027 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 5028 ) 5029 if os.path.exists( 5030 os.path.join( 5031 databases_folders, assembly, variant_white_list_file 5032 ) 5033 ): 5034 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 5035 5036 # transcript_source 5037 transcript_source = param_exomiser.get( 5038 "transcript_source", None 5039 ) # ucsc, refseq, ensembl 5040 if transcript_source: 5041 exomiser_options += ( 5042 f" --exomiser.{assembly}.transcript-source={transcript_source} " 5043 ) 5044 5045 # If analysis contain proband param 5046 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 5047 "proband", {} 5048 ): 5049 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 5050 5051 # If no proband (usually uniq sample) 5052 else: 5053 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 5054 5055 # Log 5056 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 5057 5058 # Run command 5059 result = subprocess.call( 5060 exomiser_command_analysis.split(), stdout=subprocess.PIPE 5061 ) 5062 if result: 5063 log.error("Exomiser command failed") 5064 raise ValueError("Exomiser command failed") 5065 5066 ### RESULTS ### 5067 ############### 5068 5069 ### Annotate with TSV fields ### 5070 5071 # Init result tsv file 5072 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 5073 5074 # Init result tsv file 5075 output_results_tsv = os.path.join(output_results, "howard.variants.tsv") 5076 5077 # Parse TSV file and explode columns in INFO field 5078 if exomiser_to_info and os.path.exists(output_results_tsv): 5079 5080 # Log 5081 log.debug("Exomiser columns to VCF INFO field") 5082 5083 # Retrieve columns and 
types 5084 query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 5085 output_results_tsv_df = self.get_query_to_df(query) 5086 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 5087 5088 # Init concat fields for update 5089 sql_query_update_concat_fields = [] 5090 5091 # Fields to avoid 5092 fields_to_avoid = [ 5093 "CONTIG", 5094 "START", 5095 "END", 5096 "REF", 5097 "ALT", 5098 "QUAL", 5099 "FILTER", 5100 "GENOTYPE", 5101 ] 5102 5103 # List all columns to add into header 5104 for header_column in output_results_tsv_columns: 5105 5106 # If header column is enable 5107 if header_column not in fields_to_avoid: 5108 5109 # Header info type 5110 header_info_type = "String" 5111 header_column_df = output_results_tsv_df[header_column] 5112 header_column_df_dtype = header_column_df.dtype 5113 if header_column_df_dtype == object: 5114 if ( 5115 pd.to_numeric(header_column_df, errors="coerce") 5116 .notnull() 5117 .all() 5118 ): 5119 header_info_type = "Float" 5120 else: 5121 header_info_type = "Integer" 5122 5123 # Header info 5124 characters_to_validate = ["-"] 5125 pattern = "[" + "".join(characters_to_validate) + "]" 5126 header_info_name = re.sub( 5127 pattern, 5128 "_", 5129 f"Exomiser_{header_column}".replace("#", ""), 5130 ) 5131 header_info_number = "." 
5132 header_info_description = ( 5133 f"Exomiser {header_column} annotation" 5134 ) 5135 header_info_source = "Exomiser" 5136 header_info_version = "unknown" 5137 header_info_code = CODE_TYPE_MAP[header_info_type] 5138 vcf_reader.infos[header_info_name] = vcf.parser._Info( 5139 header_info_name, 5140 header_info_number, 5141 header_info_type, 5142 header_info_description, 5143 header_info_source, 5144 header_info_version, 5145 header_info_code, 5146 ) 5147 5148 # Add field to add for update to concat fields 5149 sql_query_update_concat_fields.append( 5150 f""" 5151 CASE 5152 WHEN table_parquet."{header_column}" NOT IN ('','.') 5153 THEN concat( 5154 '{header_info_name}=', 5155 table_parquet."{header_column}", 5156 ';' 5157 ) 5158 5159 ELSE '' 5160 END 5161 """ 5162 ) 5163 5164 # Update query 5165 sql_query_update = f""" 5166 UPDATE {table_variants} as table_variants 5167 SET INFO = concat( 5168 CASE 5169 WHEN INFO NOT IN ('', '.') 5170 THEN INFO 5171 ELSE '' 5172 END, 5173 CASE 5174 WHEN table_variants.INFO NOT IN ('','.') 5175 THEN ';' 5176 ELSE '' 5177 END, 5178 ( 5179 SELECT 5180 concat( 5181 {",".join(sql_query_update_concat_fields)} 5182 ) 5183 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 5184 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 5185 AND table_parquet.\"START\" = table_variants.\"POS\" 5186 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5187 AND table_parquet.\"REF\" = table_variants.\"REF\" 5188 ) 5189 ) 5190 ; 5191 """ 5192 5193 # Update 5194 self.conn.execute(sql_query_update) 5195 5196 ### Annotate with VCF INFO field ### 5197 5198 # Init result VCF file 5199 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 5200 5201 # If VCF exists 5202 if os.path.exists(output_results_vcf): 5203 5204 # Log 5205 log.debug("Exomiser result VCF update variants") 5206 5207 # Find Exomiser INFO field annotation in header 5208 with 
gzip.open(output_results_vcf, "rt") as f: 5209 header_list = self.read_vcf_header(f) 5210 exomiser_vcf_header = vcf.Reader( 5211 io.StringIO("\n".join(header_list)) 5212 ) 5213 5214 # Add annotation INFO field to header 5215 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 5216 5217 # Update variants with VCF 5218 self.update_from_vcf(output_results_vcf) 5219 5220 return True
This function annotates variants with Exomiser.
This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
- "analysis" (dict/file): Full analysis dictionary of parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO). Default: None
- "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
- "phenopacket" (dict/file): Samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
- "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
- "sample" (string): Sample name used to construct the "subject" section, i.e. "subject": { "id": "<sample>", "sex": "UNKNOWN_SEX" }. Default: None
- "phenotypicFeatures" (dict): Phenotypic features used to construct the "phenotypicFeatures" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
- "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
- "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
- "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
- "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
- "release" (string): Exomiser database release. If it does not exist locally, the database release will be downloaded (this takes a while). Default: None (provided by the application.properties configuration file)
- "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).
Notes:
- If no sample in parameters, first sample in VCF will be chosen
- If no HPO is found, the "hiPhivePrioritiser" analysis step will be switched off
Parameters
- threads: The number of threads to use
Returns
None.
5222 def annotation_snpeff(self, threads: int = None) -> None: 5223 """ 5224 This function annotate with snpEff 5225 5226 :param threads: The number of threads to use 5227 :return: the value of the variable "return_value". 5228 """ 5229 5230 # DEBUG 5231 log.debug("Start annotation with snpeff databases") 5232 5233 # Threads 5234 if not threads: 5235 threads = self.get_threads() 5236 log.debug("Threads: " + str(threads)) 5237 5238 # DEBUG 5239 delete_tmp = True 5240 if self.get_config().get("verbosity", "warning") in ["debug"]: 5241 delete_tmp = False 5242 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5243 5244 # Config 5245 config = self.get_config() 5246 log.debug("Config: " + str(config)) 5247 5248 # Config - Folders - Databases 5249 databases_folders = ( 5250 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 5251 ) 5252 log.debug("Databases annotations: " + str(databases_folders)) 5253 5254 # Config - snpEff bin command 5255 snpeff_bin_command = get_bin_command( 5256 bin="snpEff.jar", 5257 tool="snpeff", 5258 bin_type="jar", 5259 config=config, 5260 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 5261 ) 5262 if not snpeff_bin_command: 5263 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 5264 log.error(msg_err) 5265 raise ValueError(msg_err) 5266 5267 # Config - snpEff databases 5268 snpeff_databases = ( 5269 config.get("folders", {}) 5270 .get("databases", {}) 5271 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 5272 ) 5273 snpeff_databases = full_path(snpeff_databases) 5274 if snpeff_databases is not None and snpeff_databases != "": 5275 log.debug(f"Create snpEff databases folder") 5276 if not os.path.exists(snpeff_databases): 5277 os.makedirs(snpeff_databases) 5278 5279 # Param 5280 param = self.get_param() 5281 log.debug("Param: " + str(param)) 5282 5283 # Param 5284 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 5285 log.debug("Options: " + str(options)) 5286 5287 # Param - Assembly 
5288 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5289 5290 # Param - Options 5291 snpeff_options = ( 5292 param.get("annotation", {}).get("snpeff", {}).get("options", "") 5293 ) 5294 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 5295 snpeff_csvstats = ( 5296 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 5297 ) 5298 if snpeff_stats: 5299 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 5300 snpeff_stats = full_path(snpeff_stats) 5301 snpeff_options += f" -stats {snpeff_stats}" 5302 if snpeff_csvstats: 5303 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 5304 snpeff_csvstats = full_path(snpeff_csvstats) 5305 snpeff_options += f" -csvStats {snpeff_csvstats}" 5306 5307 # Data 5308 table_variants = self.get_table_variants() 5309 5310 # Check if not empty 5311 log.debug("Check if not empty") 5312 sql_query_chromosomes = ( 5313 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5314 ) 5315 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 5316 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5317 log.info(f"VCF empty") 5318 return 5319 5320 # Export in VCF 5321 log.debug("Create initial file to annotate") 5322 tmp_vcf = NamedTemporaryFile( 5323 prefix=self.get_prefix(), 5324 dir=self.get_tmp_dir(), 5325 suffix=".vcf.gz", 5326 delete=True, 5327 ) 5328 tmp_vcf_name = tmp_vcf.name 5329 5330 # VCF header 5331 vcf_reader = self.get_header() 5332 log.debug("Initial header: " + str(vcf_reader.infos)) 5333 5334 # Existing annotations 5335 for vcf_annotation in self.get_header().infos: 5336 5337 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5338 log.debug( 5339 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5340 ) 5341 5342 # Memory limit 5343 # if config.get("memory", None): 5344 # memory_limit = config.get("memory", "8G") 5345 # else: 5346 # 
memory_limit = "8G" 5347 memory_limit = self.get_memory("8G") 5348 log.debug(f"memory_limit: {memory_limit}") 5349 5350 # snpEff java options 5351 snpeff_java_options = ( 5352 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 5353 ) 5354 log.debug(f"Exomiser java options: {snpeff_java_options}") 5355 5356 force_update_annotation = True 5357 5358 if "ANN" not in self.get_header().infos or force_update_annotation: 5359 5360 # Check snpEff database 5361 log.debug(f"Check snpEff databases {[assembly]}") 5362 databases_download_snpeff( 5363 folder=snpeff_databases, assemblies=[assembly], config=config 5364 ) 5365 5366 # Export VCF file 5367 self.export_variant_vcf( 5368 vcf_file=tmp_vcf_name, 5369 remove_info=True, 5370 add_samples=False, 5371 index=True, 5372 ) 5373 5374 # Tmp file 5375 err_files = [] 5376 tmp_annotate_vcf = NamedTemporaryFile( 5377 prefix=self.get_prefix(), 5378 dir=self.get_tmp_dir(), 5379 suffix=".vcf", 5380 delete=False, 5381 ) 5382 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5383 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5384 err_files.append(tmp_annotate_vcf_name_err) 5385 5386 # Command 5387 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 5388 log.debug(f"Annotation - snpEff command: {snpeff_command}") 5389 run_parallel_commands([snpeff_command], 1) 5390 5391 # Error messages 5392 log.info(f"Error/Warning messages:") 5393 error_message_command_all = [] 5394 error_message_command_warning = [] 5395 error_message_command_err = [] 5396 for err_file in err_files: 5397 with open(err_file, "r") as f: 5398 for line in f: 5399 message = line.strip() 5400 error_message_command_all.append(message) 5401 if line.startswith("[W::"): 5402 error_message_command_warning.append(message) 5403 if line.startswith("[E::"): 5404 error_message_command_err.append(f"{err_file}: " + message) 5405 # log info 
5406 for message in list( 5407 set(error_message_command_err + error_message_command_warning) 5408 ): 5409 log.info(f" {message}") 5410 # debug info 5411 for message in list(set(error_message_command_all)): 5412 log.debug(f" {message}") 5413 # failed 5414 if len(error_message_command_err): 5415 log.error("Annotation failed: Error in commands") 5416 raise ValueError("Annotation failed: Error in commands") 5417 5418 # Find annotation in header 5419 with open(tmp_annotate_vcf_name, "rt") as f: 5420 header_list = self.read_vcf_header(f) 5421 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5422 5423 for ann in annovar_vcf_header.infos: 5424 if ann not in self.get_header().infos: 5425 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5426 5427 # Update variants 5428 log.info(f"Annotation - Updating...") 5429 self.update_from_vcf(tmp_annotate_vcf_name) 5430 5431 else: 5432 if "ANN" in self.get_header().infos: 5433 log.debug(f"Existing snpEff annotations in VCF") 5434 if force_update_annotation: 5435 log.debug(f"Existing snpEff annotations in VCF - annotation forced")
This function annotates variants with snpEff.
Parameters
- threads: The number of threads to use
Returns
None.
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate loaded variants with Annovar.

        Exports the variants table to a temporary VCF, runs table_annovar.pl once
        per configured database, post-processes each result through a
        sed/awk/bcftools pipeline, merges all annotated VCFs with bcftools, merges
        new INFO header definitions into the in-memory header, and updates the
        variants table from the merged VCF. Temporary files are removed at the end.

        :param threads: number of threads to use (defaults to `self.get_threads()`)
        :return: None
        :raises ValueError: if the annovar/bcftools binaries or the databases
            folder cannot be resolved, or if a command wrote error lines to stderr
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp en Err files (collected throughout; removed in the cleanup step below)
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is computed but the cleanup step below runs
        # under `if True:` and ignores it — confirm intent.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (perl wrapper resolved by helper)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command (used for the post-processing pipeline and merge)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder; a list config takes its first entry;
        # the folder is created if missing
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = f"Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl CLI options, keyed by option name)
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations (mapping: database name -> {field: renamed_field})
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        force_update_annotation = True

        if annotations:

            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file (INFO reduced to ".", no samples, indexed)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (fed to bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing databases)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One table_annovar.pl run (plus post-processing pipeline) per database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # No fields configured -> keep the whole INFO of the database
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar (working directory + expected output names)
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    # force_update_annotation is True above, so fields are always kept
                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (one "old new" line per field in the rename file)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: "f" filter-based by default, "g" gene-based for
                # refGene/ensGene databases, "r" region-based for cytoBand
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options (all remaining param options forwarded as --key=value)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar (run table_annovar.pl, then move output aside for the pipe)
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan stderr captures for warning/error lines
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed: any error line aborts the annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged VCF
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge: combine the initial VCF and all per-database results
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged VCF and merge new INFO
                # definitions into the in-memory header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)
It takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations.

Parameters:
- threads: number of threads to use.

Returns: None.
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table with one or more Parquet annotation databases.

        For each configured annotation database, the database header (``.hdr``) is
        loaded, the requested INFO fields are matched against the database columns,
        and per-chromosome SQL UPDATE queries are built and executed on the duckdb
        connection so that the annotations are concatenated into the variants
        ``INFO`` column.

        :param threads: number of threads to use for the annotation
            (defaults to ``self.get_threads()``)
        :return: None
        :raises ValueError: if an annotation database file or its header file
            cannot be found
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # Keep temporary files/folders when verbosity is "debug", for inspection
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        # Databases are searched in both the "annotations" and "parquet" folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation
        # "annotations_update": re-annotate fields already present (old value removed)
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        # "annotations_append": only fill fields whose current value is empty/"."
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns
        # NOTE(review): populated nowhere in this method, but iterated at the end
        # (cleanup hook) — confirm whether helpers are expected to fill it
        added_columns = []

        # drop indexes
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # "ALL" pseudo-annotation: scan available databases and add them all
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                # "ALL" is a directive, not an actual database
                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    msg_err_list = []
                    if not parquet_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file not found"
                        )
                    if parquet_file and not parquet_hdr_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
                        )

                    log.error(". ".join(msg_err_list))
                    raise ValueError(". ".join(msg_err_list))
                else:
                    # Get parquet connexion
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    # Register extra (non-header) database columns as synthetic
                    # String INFO fields so they can be selected like the others
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        # Annotate when the field exists in the database header and
                        # either it is new, or update/append mode is enabled
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                        concat(table_variants.INFO,''),
                                        ';*{annotation_fields_new_name}=[^;]*',
                                        ''
                                    )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header
                            # Fall back to generic defaults for missing metadata
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append
                            # In append mode, only fill values currently empty or "."
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                    CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                    END
                                    """
                                )
                            # Found in a specific column
                            else:
                                # Values are sanitized: ';' replaced by ',' so the
                                # INFO field separator is not corrupted
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                    CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
                                        ELSE ''
                                    END
                                    """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # allow_annotation_full_info = True
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    # Fast path: when every field of the database is wanted and the
                    # database exposes a full INFO column, copy it wholesale instead
                    # of extracting field by field
                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                        """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # Field-removal queries (update mode) run before annotation
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        # One UPDATE query per chromosome, to bound query size
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database
                            # Region overlap join (POS within [START+1, END]),
                            # aggregated per position
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                    )
                                    as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database
                            # Exact match on CHROM/POS/REF/ALT
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query
                            # Existing INFO is kept; a ';' is inserted only when both
                            # the existing INFO and the new annotations are non-empty
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                SET INFO =
                                    concat(
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                            THEN table_variants.INFO
                                            ELSE ''
                                        END
                                        ,
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                                AND (
                                                    concat({sql_query_annotation_update_info_sets_sql})
                                                )
                                                NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        {sql_query_annotation_update_info_sets_sql}
                                    )
                                {sql_query_annotation_from_clause}
                                WHERE {sql_query_annotation_where_clause}
                                ;
                            """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x
                        # Generated queries can nest deeply; raise duckdb's limit
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
It takes a VCF file and annotates it with a Parquet annotation database.

Parameters:
- threads: number of threads to use for the annotation.

Returns: None.
6412 def annotation_splice(self, threads: int = None) -> None: 6413 """ 6414 This function annotate with snpEff 6415 6416 :param threads: The number of threads to use 6417 :return: the value of the variable "return_value". 6418 """ 6419 6420 # DEBUG 6421 log.debug("Start annotation with splice tools") 6422 6423 # Threads 6424 if not threads: 6425 threads = self.get_threads() 6426 log.debug("Threads: " + str(threads)) 6427 6428 # DEBUG 6429 delete_tmp = True 6430 if self.get_config().get("verbosity", "warning") in ["debug"]: 6431 delete_tmp = False 6432 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6433 6434 # Config 6435 config = self.get_config() 6436 log.debug("Config: " + str(config)) 6437 splice_config = config.get("tools", {}).get("splice", {}) 6438 if not splice_config: 6439 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6440 msg_err = "No Splice tool config" 6441 raise ValueError(msg_err) 6442 log.debug(f"splice_config: {splice_config}") 6443 6444 # Config - Folders - Databases 6445 databases_folders = ( 6446 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6447 ) 6448 log.debug("Databases annotations: " + str(databases_folders)) 6449 6450 # Splice docker image 6451 splice_docker_image = splice_config.get("docker").get("image") 6452 6453 # Pull splice image if it's not already there 6454 if not check_docker_image_exists(splice_docker_image): 6455 log.warning( 6456 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6457 ) 6458 try: 6459 command(f"docker pull {splice_config.get('docker').get('image')}") 6460 except subprocess.CalledProcessError: 6461 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6462 log.error(msg_err) 6463 raise ValueError(msg_err) 6464 6465 # Config - splice databases 6466 splice_databases = ( 6467 config.get("folders", {}) 6468 .get("databases", {}) 6469 .get("splice", DEFAULT_SPLICE_FOLDER) 6470 ) 6471 splice_databases = 
full_path(splice_databases) 6472 6473 # Param 6474 param = self.get_param() 6475 log.debug("Param: " + str(param)) 6476 6477 # Param 6478 options = param.get("annotation", {}).get("splice", {}).get("options", {}) 6479 log.debug("Options: " + str(options)) 6480 6481 # Data 6482 table_variants = self.get_table_variants() 6483 6484 # Check if not empty 6485 log.debug("Check if not empty") 6486 sql_query_chromosomes = ( 6487 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6488 ) 6489 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6490 log.info("VCF empty") 6491 return None 6492 6493 # Export in VCF 6494 log.debug("Create initial file to annotate") 6495 6496 # Create output folder / work folder 6497 if options.get("output_folder", ""): 6498 output_folder = options.get("output_folder", "") 6499 if not os.path.exists(output_folder): 6500 Path(output_folder).mkdir(parents=True, exist_ok=True) 6501 else: 6502 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6503 if not os.path.exists(output_folder): 6504 Path(output_folder).mkdir(parents=True, exist_ok=True) 6505 6506 if options.get("workdir", ""): 6507 workdir = options.get("workdir", "") 6508 else: 6509 workdir = "/work" 6510 6511 # Create tmp VCF file 6512 tmp_vcf = NamedTemporaryFile( 6513 prefix=self.get_prefix(), 6514 dir=output_folder, 6515 suffix=".vcf", 6516 delete=False, 6517 ) 6518 tmp_vcf_name = tmp_vcf.name 6519 6520 # VCF header 6521 header = self.get_header() 6522 6523 # Existing annotations 6524 for vcf_annotation in self.get_header().infos: 6525 6526 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6527 log.debug( 6528 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6529 ) 6530 6531 # Memory limit 6532 if config.get("memory", None): 6533 memory_limit = config.get("memory", "8G").upper() 6534 # upper() 6535 else: 6536 memory_limit = "8G" 6537 log.debug(f"memory_limit: {memory_limit}") 6538 6539 # 
Check number of variants to annotate 6540 where_clause_regex_spliceai = r"SpliceAI_\w+" 6541 where_clause_regex_spip = r"SPiP_\w+" 6542 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6543 df_list_of_variants_to_annotate = self.get_query_to_df( 6544 query=f""" SELECT * FROM variants {where_clause} """ 6545 ) 6546 if len(df_list_of_variants_to_annotate) == 0: 6547 log.warning( 6548 f"No variants to annotate with splice. Variants probably already annotated with splice" 6549 ) 6550 return None 6551 else: 6552 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6553 6554 # Export VCF file 6555 self.export_variant_vcf( 6556 vcf_file=tmp_vcf_name, 6557 remove_info=True, 6558 add_samples=True, 6559 index=False, 6560 where_clause=where_clause, 6561 ) 6562 mount = [f" -v {path}:{path}:rw" for path in [output_folder]] 6563 if any(value for value in splice_config.values() if value is None): 6564 log.warning("At least one splice config parameter is empty") 6565 # exit annotation_splice 6566 return None 6567 6568 # Params in splice nf 6569 def check_values(dico: dict): 6570 """ 6571 Ensure parameters for NF splice pipeline 6572 """ 6573 for key, val in dico.items(): 6574 if key == "genome": 6575 if any( 6576 assemb in options.get("genome", {}) 6577 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6578 ): 6579 yield f"--{key} hg19" 6580 elif any( 6581 assemb in options.get("genome", {}) 6582 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6583 ): 6584 yield f"--{key} hg38" 6585 elif ( 6586 (isinstance(val, str) and val) 6587 or isinstance(val, int) 6588 or isinstance(val, bool) 6589 ): 6590 yield f"--{key} {val}" 6591 6592 # Genome 6593 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6594 options["genome"] = genome 6595 # NF params 6596 nf_params = [] 6597 # Add options 6598 if options: 6599 log.debug(options) 6600 nf_params 
= list(check_values(options)) 6601 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6602 else: 6603 log.debug("No NF params provided") 6604 # Add threads 6605 if "threads" not in options.keys(): 6606 nf_params.append(f"--threads {threads}") 6607 # Genome path 6608 genome_path = find_genome( 6609 config.get("folders", {}) 6610 .get("databases", {}) 6611 .get("genomes", DEFAULT_GENOME_FOLDER), 6612 file=f"{genome}.fa", 6613 ) 6614 # Add genome path 6615 if not genome_path: 6616 raise ValueError( 6617 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6618 ) 6619 else: 6620 log.debug(f"Genome: {genome_path}") 6621 nf_params.append(f"--genome_path {genome_path}") 6622 6623 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6624 """ 6625 Setting up updated databases for SPiP and SpliceAI 6626 """ 6627 6628 try: 6629 6630 # SpliceAI assembly transcriptome 6631 spliceai_assembly = os.path.join( 6632 config.get("folders", {}).get("databases", {}).get("spliceai", {}), 6633 options.get("genome"), 6634 "transcriptome", 6635 ) 6636 spip_assembly = options.get("genome") 6637 6638 spip = find( 6639 f"transcriptome_{spip_assembly}.RData", 6640 config.get("folders", {}).get("databases", {}).get("spip", {}), 6641 ) 6642 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6643 log.debug(f"SPiP annotations: {spip}") 6644 log.debug(f"SpliceAI annotations: {spliceai}") 6645 if spip and spliceai: 6646 return [ 6647 f"--spip_transcriptome {spip}", 6648 f"--spliceai_transcriptome {spliceai}", 6649 ] 6650 else: 6651 log.warning( 6652 "Can't find splice databases in configuration, use annotations file from image" 6653 ) 6654 except TypeError: 6655 log.warning( 6656 "Can't find splice databases in configuration, use annotations file from image" 6657 ) 6658 return [] 6659 6660 # Add options, check if transcriptome option have already beend provided 6661 if ( 6662 
"spip_transcriptome" not in nf_params 6663 and "spliceai_transcriptome" not in nf_params 6664 ): 6665 splice_reference = splice_annotations(options, config) 6666 if splice_reference: 6667 nf_params.extend(splice_reference) 6668 # nf_params.append(f"--output_folder {output_folder}") 6669 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6670 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6671 log.debug(cmd) 6672 splice_config["docker"]["command"] = cmd 6673 6674 # Ensure proxy is set 6675 proxy = [ 6676 f"-e {var}={os.getenv(var)}" 6677 for var in ["https_proxy", "http_proxy", "ftp_proxy"] 6678 if os.getenv(var) is not None 6679 ] 6680 docker_cmd = get_bin_command( 6681 tool="splice", 6682 bin_type="docker", 6683 config=config, 6684 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6685 add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}", 6686 ) 6687 # print(docker_cmd) 6688 # exit() 6689 # Docker debug 6690 # if splice_config.get("rm_container"): 6691 # rm_container = "--rm" 6692 # else: 6693 # rm_container = "" 6694 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6695 log.debug(docker_cmd) 6696 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6697 log.debug(res.stdout) 6698 if res.stderr: 6699 log.error(res.stderr) 6700 res.check_returncode() 6701 # Update variants 6702 log.info("Annotation - Updating...") 6703 # Test find output vcf 6704 log.debug( 6705 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6706 ) 6707 output_vcf = [] 6708 # Wrong folder to look in 6709 for 
files in os.listdir(os.path.dirname(tmp_vcf_name)): 6710 if ( 6711 files 6712 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6713 ): 6714 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6715 # log.debug(os.listdir(options.get("output_folder"))) 6716 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6717 if not output_vcf: 6718 log.debug( 6719 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6720 ) 6721 else: 6722 # Get new header from annotated vcf 6723 log.debug(f"Initial header: {len(header.infos)} fields") 6724 # Create new header with splice infos 6725 new_vcf = Variants(input=output_vcf[0]) 6726 new_vcf_header = new_vcf.get_header().infos 6727 for keys, infos in new_vcf_header.items(): 6728 if keys not in header.infos.keys(): 6729 header.infos[keys] = infos 6730 log.debug(f"New header: {len(header.infos)} fields") 6731 log.debug(f"Splice tmp output: {output_vcf[0]}") 6732 self.update_from_vcf(output_vcf[0]) 6733 6734 # Remove file 6735 remove_if_exists(output_vcf)
This function annotates with splice prediction tools (SPiP and SpliceAI).

Parameters:
- threads: The number of threads to use.

Returns: None.
    def get_config_default(self, name: str) -> dict:
        """
        The function `get_config_default` returns a dictionary containing default configurations for
        various calculations and prioritizations.

        :param name: Key of the default configuration section to retrieve
            (the sections defined here are "calculations" and "prioritizations")
        :type name: str
        :return: The default configuration dictionary registered under `name`.
            If `name` does not match a known section, None is returned
            (via `dict.get(name, None)`).
        """

        # Registry of built-in default configurations, keyed by section name.
        config_default = {
            # Built-in calculation operations: "sql" entries are applied as SQL
            # expressions, "python" entries name a calculation function to call.
            "calculations": {
                "variant_chr_pos_alt_ref": {
                    "type": "sql",
                    "name": "variant_chr_pos_alt_ref",
                    "description": "Create a variant ID with chromosome, position, alt and ref",
                    "available": False,
                    "output_column_name": "variant_chr_pos_alt_ref",
                    "output_column_type": "String",
                    "output_column_description": "variant ID with chromosome, position, alt and ref",
                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
                    "operation_info": True,
                },
                "VARTYPE": {
                    "type": "sql",
                    "name": "VARTYPE",
                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
                    "available": True,
                    "table": "variants",
                    "output_column_name": "VARTYPE",
                    "output_column_type": "String",
                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
                    # SVTYPE (if present in INFO) takes precedence over the
                    # REF/ALT length-based classification below.
                    "operation_query": """
                        CASE
                            WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
                            WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
                            WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
                            WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
                            WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
                            ELSE 'UNDEFINED'
                        END
                        """,
                    "info_fields": ["SVTYPE"],
                    "operation_info": True,
                },
                "snpeff_hgvs": {
                    "type": "python",
                    "name": "snpeff_hgvs",
                    "description": "HGVS nomenclatures from snpEff annotation",
                    "available": True,
                    "function_name": "calculation_extract_snpeff_hgvs",
                    "function_params": ["snpeff_hgvs", "ANN"],
                },
                "snpeff_ann_explode": {
                    "type": "python",
                    "name": "snpeff_ann_explode",
                    "description": "Explode snpEff annotations with uniquify values",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "fields", "snpeff_", "ANN"],
                },
                "snpeff_ann_explode_uniquify": {
                    "type": "python",
                    "name": "snpeff_ann_explode_uniquify",
                    "description": "Explode snpEff annotations",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
                },
                "snpeff_ann_explode_json": {
                    "type": "python",
                    "name": "snpeff_ann_explode_json",
                    "description": "Explode snpEff annotations in JSON format",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
                },
                "NOMEN": {
                    "type": "python",
                    "name": "NOMEN",
                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
                    "available": True,
                    "function_name": "calculation_extract_nomen",
                    "function_params": [],
                },
                "FINDBYPIPELINE": {
                    "type": "python",
                    "name": "FINDBYPIPELINE",
                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbypipeline"],
                },
                "FINDBYSAMPLE": {
                    "type": "python",
                    "name": "FINDBYSAMPLE",
                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbysample"],
                },
                "GENOTYPECONCORDANCE": {
                    "type": "python",
                    "name": "GENOTYPECONCORDANCE",
                    "description": "Concordance of genotype for multi caller VCF",
                    "available": True,
                    "function_name": "calculation_genotype_concordance",
                    "function_params": [],
                },
                "BARCODE": {
                    "type": "python",
                    "name": "BARCODE",
                    "description": "BARCODE as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode",
                    "function_params": [],
                },
                "BARCODEFAMILY": {
                    "type": "python",
                    "name": "BARCODEFAMILY",
                    "description": "BARCODEFAMILY as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode_family",
                    "function_params": ["BCF"],
                },
                "TRIO": {
                    "type": "python",
                    "name": "TRIO",
                    "description": "Inheritance for a trio family",
                    "available": True,
                    "function_name": "calculation_trio",
                    "function_params": [],
                },
                "VAF": {
                    "type": "python",
                    "name": "VAF",
                    "description": "Variant Allele Frequency (VAF) harmonization",
                    "available": True,
                    "function_name": "calculation_vaf_normalization",
                    "function_params": [],
                },
                "VAF_stats": {
                    "type": "python",
                    "name": "VAF_stats",
                    "description": "Variant Allele Frequency (VAF) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["VAF"],
                },
                "DP_stats": {
                    "type": "python",
                    "name": "DP_stats",
                    "description": "Depth (DP) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["DP"],
                },
                "variant_id": {
                    "type": "python",
                    "name": "variant_id",
                    "description": "Variant ID generated from variant position and type",
                    "available": True,
                    "function_name": "calculation_variant_id",
                    "function_params": [],
                },
                "transcripts_json": {
                    "type": "python",
                    "name": "transcripts_json",
                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": ["transcripts_json", None],
                },
                "transcripts_ann": {
                    "type": "python",
                    "name": "transcripts_ann",
                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, "transcripts_ann"],
                },
                "transcripts_annotations": {
                    "type": "python",
                    "name": "transcripts_annotations",
                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, None],
                },
                "transcripts_prioritization": {
                    "type": "python",
                    "name": "transcripts_prioritization",
                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
                    "available": True,
                    "function_name": "calculation_transcripts_prioritization",
                    "function_params": [],
                },
                "transcripts_export": {
                    "type": "python",
                    "name": "transcripts_export",
                    "description": "Export transcripts table/view as a file (using param.json)",
                    "available": True,
                    "function_name": "calculation_transcripts_export",
                    "function_params": [],
                },
            },
            # Built-in prioritization profiles: the "default" profile scores
            # variants from the snpEff putative-impact level found in ANN2.
            "prioritizations": {
                "default": {
                    "ANN2": [
                        {
                            "type": "contains",
                            "value": "HIGH",
                            "score": 5,
                            "flag": "PASS",
                            "comment": [
                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODERATE",
                            "score": 3,
                            "flag": "PASS",
                            "comment": [
                                "A non-disruptive variant that might change protein effectiveness"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "LOW",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Assumed to be mostly harmless or unlikely to change protein behavior"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODIFIER",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
                            ],
                        },
                    ],
                }
            },
        }

        # Unknown names yield None (not an empty dict) — callers must handle it.
        return config_default.get(name, None)
The function get_config_default returns a dictionary containing default configurations for
various calculations and prioritizations.
Parameters
- name: The name of the specific default configuration section to retrieve from the `config_default` dictionary (e.g. "calculations" or "prioritizations").
Returns
The function `get_config_default` returns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the input `name` parameter. If the `name` parameter matches a key in the `config_default` dictionary, the corresponding configuration settings are returned; if there is no match, None is returned.
6999 def get_config_json( 7000 self, name: str, config_dict: dict = {}, config_file: str = None 7001 ) -> dict: 7002 """ 7003 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 7004 default values, a dictionary, and a file. 7005 7006 :param name: The `name` parameter in the `get_config_json` function is a string that represents 7007 the name of the configuration. It is used to identify and retrieve the configuration settings 7008 for a specific component or module 7009 :type name: str 7010 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 7011 dictionary that allows you to provide additional configuration settings or overrides. When you 7012 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 7013 the key is the configuration setting you want to override or 7014 :type config_dict: dict 7015 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 7016 specify the path to a configuration file that contains additional settings. If provided, the 7017 function will read the contents of this file and update the configuration dictionary with the 7018 values found in the file, overriding any existing values with the 7019 :type config_file: str 7020 :return: The function `get_config_json` returns a dictionary containing the configuration 7021 settings. 
7022 """ 7023 7024 # Create with default prioritizations 7025 config_default = self.get_config_default(name=name) 7026 configuration = config_default 7027 # log.debug(f"configuration={configuration}") 7028 7029 # Replace prioritizations from dict 7030 for config in config_dict: 7031 configuration[config] = config_dict[config] 7032 7033 # Replace prioritizations from file 7034 config_file = full_path(config_file) 7035 if config_file: 7036 if os.path.exists(config_file): 7037 with open(config_file) as config_file_content: 7038 config_file_dict = json.load(config_file_content) 7039 for config in config_file_dict: 7040 configuration[config] = config_file_dict[config] 7041 else: 7042 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 7043 log.error(msg_error) 7044 raise ValueError(msg_error) 7045 7046 return configuration
The function get_config_json retrieves a configuration JSON object with prioritizations from
default values, a dictionary, and a file.
Parameters
- name: The
nameparameter in theget_config_jsonfunction is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module - config_dict: The
config_dictparameter in theget_config_jsonfunction is a dictionary that allows you to provide additional configuration settings or overrides. When you call theget_config_jsonfunction, you can pass a dictionary containing key-value pairs where the key is the configuration setting you want to override or - config_file: The
config_fileparameter in theget_config_jsonfunction is used to specify the path to a configuration file that contains additional settings. If provided, the function will read the contents of this file and update the configuration dictionary with the values found in the file, overriding any existing values with the
Returns
The function `get_config_json` returns a dictionary containing the configuration settings.
    def prioritization(
        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
    ) -> bool:
        """
        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
        prioritizes variants based on configured profiles and criteria.

        For each configured profile, criteria are translated into SQL UPDATE
        statements over the variants table: scores are accumulated (or maximized
        in "VaRank" mode), flags are AND-combined, and the resulting PZ* values
        are concatenated back into the INFO column.

        :param table: The `table` parameter is used to specify the name of the
            table on which the prioritization operation will be performed. If not
            provided, the default variants table (update clause) is used
        :type table: str
        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix
            that will be added to the INFO fields created during prioritization.
            If not provided, the value from the parameters is used, defaulting
            to "PZ"
        :type pz_prefix: str
        :param pz_param: The `pz_param` parameter is used to pass additional
            prioritization parameters (profiles, fields, scoring modes, flags,
            comments, ...), overriding the "prioritization" section of the
            instance parameters
        :type pz_param: dict
        :return: True when the function completes; False when no profile is
            defined (nothing to do)
        :raises ValueError: If a requested profile is not configured, if an
            annotation field required by a criterion is absent from the data, or
            if a criterion matches neither the 'operation' nor the 'sql' mode
        """

        # Config
        # NOTE(review): 'config' appears unused in this method — confirm before removing
        config = self.get_config()

        # Param
        param = self.get_param()

        # Prioritization param (explicit pz_param overrides instance parameters)
        if pz_param is not None:
            prioritization_param = pz_param
        else:
            prioritization_param = param.get("prioritization", {})

        # Configuration profiles (defaults merged with optional JSON config file)
        prioritization_config_file = prioritization_param.get(
            "prioritization_config", None
        )
        prioritization_config_file = full_path(prioritization_config_file)
        prioritizations_config = self.get_config_json(
            name="prioritizations", config_file=prioritization_config_file
        )

        # Prioritization prefix
        pz_prefix_default = "PZ"
        if pz_prefix is None:
            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)

        # Prioritization options (comma-separated strings accepted as lists)
        profiles = prioritization_param.get("profiles", [])
        if isinstance(profiles, str):
            profiles = profiles.split(",")
        pzfields = prioritization_param.get(
            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
        )
        if isinstance(pzfields, str):
            pzfields = pzfields.split(",")
        default_profile = prioritization_param.get("default_profile", None)
        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
        prioritization_score_mode = prioritization_param.get(
            "prioritization_score_mode", "HOWARD"
        )

        # Quick Prioritizations (top-level "prioritizations" shortcut parameter)
        prioritizations = param.get("prioritizations", None)
        if prioritizations:
            log.info("Quick Prioritization:")
            for profile in prioritizations.split(","):
                if profile not in profiles:
                    profiles.append(profile)
                log.info(f" {profile}")

        # If profile "ALL" provided, all profiles in the config profiles
        if "ALL" in profiles:
            profiles = list(prioritizations_config.keys())

        # Fail fast on any requested profile missing from configuration
        for profile in profiles:
            if prioritizations_config.get(profile, None):
                log.debug(f"Profile '{profile}' configured")
            else:
                msg_error = f"Profile '{profile}' NOT configured"
                log.error(msg_error)
                raise ValueError(msg_error)

        if profiles:
            log.info(f"Prioritization... ")
        else:
            log.debug(f"No profile defined")
            return False

        # First requested profile becomes the default when none is specified
        if not default_profile and len(profiles):
            default_profile = profiles[0]

        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
        log.debug("Profiles to check: " + str(list(profiles)))

        # Variables
        if table is not None:
            table_variants = table
        else:
            table_variants = self.get_table_variants(clause="update")
        log.debug(f"Table to prioritize: {table_variants}")

        # Added columns (temporary working columns, dropped at the end)
        added_columns = []

        # Create list of PZfields
        # List of PZFields (bare names plus one per-profile variant of each)
        list_of_pzfields_original = pzfields + [
            pzfield + pzfields_sep + profile
            for pzfield in pzfields
            for profile in profiles
        ]
        list_of_pzfields = []
        log.debug(f"{list_of_pzfields_original}")

        # Remove existing PZfields to use if exists
        for pzfield in list_of_pzfields_original:
            if self.get_header().infos.get(pzfield, None) is None:
                list_of_pzfields.append(pzfield)
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
            else:
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")

        if list_of_pzfields:

            # Explode Infos prefix
            explode_infos_prefix = self.get_explode_infos_prefix()

            # PZfields tags description (templates for VCF INFO header entries)
            PZfields_INFOS = {
                f"{pz_prefix}Tags": {
                    "ID": f"{pz_prefix}Tags",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant tags based on annotation criteria",
                },
                f"{pz_prefix}Score": {
                    "ID": f"{pz_prefix}Score",
                    "Number": 1,
                    "Type": "Integer",
                    "Description": "Variant score based on annotation criteria",
                },
                f"{pz_prefix}Flag": {
                    "ID": f"{pz_prefix}Flag",
                    "Number": 1,
                    "Type": "String",
                    "Description": "Variant flag based on annotation criteria",
                },
                f"{pz_prefix}Comment": {
                    "ID": f"{pz_prefix}Comment",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant comment based on annotation criteria",
                },
                f"{pz_prefix}Infos": {
                    "ID": f"{pz_prefix}Infos",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant infos based on annotation criteria",
                },
                f"{pz_prefix}Class": {
                    "ID": f"{pz_prefix}Class",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant class based on annotation criteria",
                },
            }

            # Create INFO fields if not exist (bare fields, default profile)
            for field in PZfields_INFOS:
                field_ID = PZfields_INFOS[field]["ID"]
                field_description = PZfields_INFOS[field]["Description"]
                if field_ID not in self.get_header().infos and field_ID in pzfields:
                    field_description = (
                        PZfields_INFOS[field]["Description"]
                        + f", profile {default_profile}"
                    )
                    self.get_header().infos[field_ID] = vcf.parser._Info(
                        field_ID,
                        PZfields_INFOS[field]["Number"],
                        PZfields_INFOS[field]["Type"],
                        field_description,
                        "unknown",
                        "unknown",
                        code_type_map[PZfields_INFOS[field]["Type"]],
                    )

            # Create INFO fields if not exist for each profile
            for profile in prioritizations_config:
                if profile in profiles or profiles == []:
                    for field in PZfields_INFOS:
                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
                        field_description = (
                            PZfields_INFOS[field]["Description"]
                            + f", profile {profile}"
                        )
                        if (
                            field_ID not in self.get_header().infos
                            and field in pzfields
                        ):
                            self.get_header().infos[field_ID] = vcf.parser._Info(
                                field_ID,
                                PZfields_INFOS[field]["Number"],
                                PZfields_INFOS[field]["Type"],
                                field_description,
                                "unknown",
                                "unknown",
                                code_type_map[PZfields_INFOS[field]["Type"]],
                            )

            # Header
            # Add one working column per PZfield, typed by its name pattern
            for pzfield in list_of_pzfields:
                if re.match(f"{pz_prefix}Score.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="INTEGER",
                        default_value="0",
                    )
                elif re.match(f"{pz_prefix}Flag.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="BOOLEAN",
                        default_value="1",
                    )
                elif re.match(f"{pz_prefix}Class.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="VARCHAR[]",
                        default_value="null",
                    )
                else:
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="STRING",
                        default_value="''",
                    )
                added_columns.append(added_column)

            # Profiles
            if profiles:

                # foreach profile in configuration file
                for profile in prioritizations_config:

                    # If profile is asked in param, or ALL are asked (empty profile [])
                    if profile in profiles or profiles == []:
                        log.info(f"Profile '{profile}'")

                        sql_set_info_option = ""

                        sql_set_info = []

                        # PZ fields set
                        # Build the SQL fragments that serialize each PZ column
                        # back into the INFO field (per-profile name, plus bare
                        # name for the default profile)

                        # PZScore
                        if (
                            f"{pz_prefix}Score{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Score{pzfields_sep}{profile}=',
                                    {pz_prefix}Score{pzfields_sep}{profile}
                                )
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Score" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Score=',
                                    {pz_prefix}Score{pzfields_sep}{profile}
                                )
                                """
                            )

                        # PZFlag
                        if (
                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Flag{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Flag" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Flag=',
                                    CASE
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )

                        # PZClass
                        if (
                            f"{pz_prefix}Class{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Class{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                        THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                        ELSE '.'
                                    END
                                )

                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Class" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Class=',
                                    CASE
                                        WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                        THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                        ELSE '.'
                                    END
                                )
                                """
                            )

                        # PZComment
                        if (
                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Comment" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # PZInfos
                        if (
                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Infos" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # Merge PZfields (';'-separated from the second fragment on)
                        sql_set_info_option = ""
                        sql_set_sep = ""
                        for sql_set in sql_set_info:
                            if sql_set_sep:
                                sql_set_info_option += f"""
                                    , concat('{sql_set_sep}', {sql_set})
                                """
                            else:
                                sql_set_info_option += f"""
                                    , {sql_set}
                                """
                            sql_set_sep = ";"

                        sql_queries = []
                        for annotation in prioritizations_config[profile]:

                            # skip special sections
                            if annotation.startswith("_"):
                                continue

                            # For each criterions
                            for criterion in prioritizations_config[profile][
                                annotation
                            ]:

                                # Criterion mode: "operation" (type/value keys)
                                # vs "sql" (sql/fields keys)
                                criterion_mode = None
                                if np.any(
                                    np.isin(list(criterion.keys()), ["type", "value"])
                                ):
                                    criterion_mode = "operation"
                                elif np.any(
                                    np.isin(list(criterion.keys()), ["sql", "fields"])
                                ):
                                    criterion_mode = "sql"
                                log.debug(f"Criterion Mode: {criterion_mode}")

                                # Criterion parameters (comments/infos sanitized
                                # for safe embedding into SQL and INFO fields)
                                criterion_type = criterion.get("type", None)
                                criterion_value = criterion.get("value", None)
                                criterion_sql = criterion.get("sql", None)
                                criterion_fields = criterion.get("fields", None)
                                criterion_score = criterion.get("score", 0)
                                criterion_flag = criterion.get("flag", "PASS")
                                criterion_class = criterion.get("class", None)
                                criterion_flag_bool = criterion_flag == "PASS"
                                criterion_comment = (
                                    ", ".join(criterion.get("comment", []))
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )
                                criterion_infos = (
                                    str(criterion)
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )

                                # SQL
                                if criterion_sql is not None and isinstance(
                                    criterion_sql, list
                                ):
                                    criterion_sql = " ".join(criterion_sql)

                                # Fields and explode (annotation name is the
                                # default field when none is given)
                                if criterion_fields is None:
                                    criterion_fields = [annotation]
                                if not isinstance(criterion_fields, list):
                                    criterion_fields = str(criterion_fields).split(",")

                                # Class
                                if criterion_class is not None and not isinstance(
                                    criterion_class, list
                                ):
                                    criterion_class = str(criterion_class).split(",")

                                for annotation_field in criterion_fields:

                                    # Explode specific annotation
                                    log.debug(
                                        f"Explode annotation '{annotation_field}'"
                                    )
                                    added_columns += self.explode_infos(
                                        prefix=explode_infos_prefix,
                                        fields=[annotation_field],
                                        table=table_variants,
                                    )
                                    extra_infos = self.get_extra_infos(
                                        table=table_variants
                                    )

                                    # Check if annotation field is present
                                    if (
                                        f"{explode_infos_prefix}{annotation_field}"
                                        not in extra_infos
                                    ):
                                        msq_err = f"Annotation '{annotation_field}' not in data"
                                        log.error(msq_err)
                                        raise ValueError(msq_err)
                                    else:
                                        log.debug(
                                            f"Annotation '{annotation_field}' in data"
                                        )

                                    sql_set = []
                                    sql_set_info = []

                                    # PZ fields set
                                    # SET clauses updating the working columns

                                    # PZScore
                                    if (
                                        f"{pz_prefix}Score{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        # if prioritization_score_mode == "HOWARD":
                                        #     sql_set.append(
                                        #         f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
                                        #     )
                                        # VaRank prioritization score mode
                                        if prioritization_score_mode == "VaRank":
                                            sql_set.append(
                                                f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
                                            )
                                        # default HOWARD prioritization score mode
                                        else:
                                            sql_set.append(
                                                f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
                                            )

                                    # PZFlag
                                    if (
                                        f"{pz_prefix}Flag{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        sql_set.append(
                                            f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
                                        )

                                    # PZClass
                                    if (
                                        f"{pz_prefix}Class{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                        and criterion_class is not None
                                    ):
                                        sql_set.append(
                                            f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
                                        )

                                    # PZComment
                                    if (
                                        f"{pz_prefix}Comment{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        sql_set.append(
                                            f"""
                                            {pz_prefix}Comment{pzfields_sep}{profile} =
                                                concat(
                                                    {pz_prefix}Comment{pzfields_sep}{profile},
                                                    CASE
                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
                                                        THEN ', '
                                                        ELSE ''
                                                    END,
                                                    '{criterion_comment}'
                                                )
                                            """
                                        )

                                    # PZInfos
                                    if (
                                        f"{pz_prefix}Infos{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        sql_set.append(
                                            f"""
                                            {pz_prefix}Infos{pzfields_sep}{profile} =
                                                concat(
                                                    {pz_prefix}Infos{pzfields_sep}{profile},
                                                    '{criterion_infos}'
                                                )
                                            """
                                        )
                                    sql_set_option = ",".join(sql_set)

                                    # Criterion and comparison
                                    if sql_set_option:

                                        if criterion_mode in ["operation"]:

                                            # Numeric values use the configured
                                            # comparison operator; non-numeric
                                            # values fall back to pattern match
                                            try:
                                                float(criterion_value)
                                                sql_update = f"""
                                                    UPDATE {table_variants}
                                                    SET {sql_set_option}
                                                    WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
                                                    AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
                                                """
                                            except:
                                                contains_option = ""
                                                if criterion_type == "contains":
                                                    contains_option = ".*"
                                                sql_update = f"""
                                                    UPDATE {table_variants}
                                                    SET {sql_set_option}
                                                    WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
                                                """
                                            sql_queries.append(sql_update)

                                        elif criterion_mode in ["sql"]:

                                            sql_update = f"""
                                                UPDATE {table_variants}
                                                SET {sql_set_option}
                                                WHERE {criterion_sql}
                                            """
                                            sql_queries.append(sql_update)

                                        else:
                                            msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
                                            log.error(msg_err)
                                            raise ValueError(msg_err)

                                    else:
                                        log.warning(
                                            f"NO SQL SET option for '{annotation}' - '{criterion}'"
                                        )

                        # PZTags
                        if (
                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):

                            # Create PZFalgs value
                            # (builds a '#'-joined summary of the other PZfields)
                            pztags_value = ""
                            pztags_sep_default = ","
                            pztags_sep = ""
                            for pzfield in pzfields:
                                if pzfield not in [f"{pz_prefix}Tags"]:
                                    if (
                                        f"{pzfield}{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        if pzfield in [f"{pz_prefix}Flag"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#',
                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
                                                    THEN 'PASS'
                                                    ELSE 'FILTERED'
                                                END, '"""
                                        elif pzfield in [f"{pz_prefix}Class"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#',
                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                                    ELSE '.'
                                                END, '"""
                                        else:
                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
                                        pztags_sep = pztags_sep_default

                            # Add Query update for PZFlags
                            sql_update_pztags = f"""
                                UPDATE {table_variants}
                                SET INFO = concat(
                                    INFO,
                                    CASE WHEN INFO NOT in ('','.')
                                        THEN ';'
                                        ELSE ''
                                    END,
                                    '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
                                )
                            """
                            sql_queries.append(sql_update_pztags)

                            # Add Query update for PZFlags for default
                            if profile == default_profile:
                                sql_update_pztags_default = f"""
                                    UPDATE {table_variants}
                                    SET INFO = concat(
                                        INFO,
                                        ';',
                                        '{pz_prefix}Tags={pztags_value}'
                                    )
                                """
                                sql_queries.append(sql_update_pztags_default)

                        log.info(f"""Profile '{profile}' - Prioritization... """)

                        if sql_queries:

                            # Run the accumulated per-criterion updates
                            for sql_query in sql_queries:
                                log.debug(
                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
                                )
                                self.conn.execute(sql_query)

                            # Serialize the PZ working columns back into INFO
                            log.info(f"""Profile '{profile}' - Update... """)
                            sql_query_update = f"""
                                UPDATE {table_variants}
                                SET INFO =
                                    concat(
                                        CASE
                                            WHEN INFO NOT IN ('','.')
                                            THEN concat(INFO, ';')
                                            ELSE ''
                                        END
                                        {sql_set_info_option}
                                    )
                            """
                            self.conn.execute(sql_query_update)

            else:

                log.warning(f"No profiles in parameters")

        # Remove added columns (temporary working columns)
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        return True
The prioritization function processes VCF files, adds new INFO fields, and
prioritizes variants based on configured profiles and criteria.
Parameters
- table: the name of the table (typically the variants table) on which the
  prioritization operation is performed. If a table name is provided, the method
  prioritizes the variants in that specific table.
- pz_prefix: a prefix added to the prioritization INFO fields (e.g. PZScore,
  PZFlag) written to the VCF during the prioritization process. If this
  parameter is not provided, the default prefix "PZ" is used.
- pz_param: additional parameters specific to the prioritization process, such
  as prioritization profiles, fields, scoring modes, flags, comments, and other
  configurations needed for the prioritization of variants in a VCF.
Returns
A boolean value (True) is returned by the prioritization function on completion.
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        Annotate variants with HGVS nomenclature computed from genomic
        coordinates and alleles.

        Three stages are visible in the code below:
        1. locate and load the genome (pyfaidx), refSeq and refSeqLink databases;
        2. compute, in parallel with Dask (one partition per thread), an HGVS
           string for every alphabetic REF/ALT variant;
        3. write the result back into a temporary column, append it to the INFO
           field as 'hgvs=...', and declare the 'hgvs' INFO field in the header.

        :param threads: optional number of threads for parallel processing;
            defaults to the value returned by `get_threads()`
        :type threads: int
        """

        # Function applied to each partition of the Dask Dataframe
        def partition_function(partition):
            """
            Apply `annotation_hgvs_partition` to every row of one partition.

            :param partition: pandas DataFrame holding one Dask partition
            :return: result of applying `annotation_hgvs_partition` row-wise
                (axis=1), i.e. one HGVS annotation string per row
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            Compute the comma-separated list of HGVS names for one variant row.

            :param row: dict-like object with keys "CHROM", "POS", "REF", "ALT"
            :return: comma-joined HGVS names, one (or two, when `add_protein`
                applies) per transcript overlapping the position; empty string
                when no transcript overlaps
            """

            chr = row["CHROM"]  # NOTE(review): shadows the `chr` builtin
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Transcripts overlapping this position (refseq_df is a closure
            # variable registered in the polars SQL context)
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                    SELECT transcript
                    FROM refseq_df
                    WHERE CHROM='{chr}'
                    AND POS={pos}
                    """
                )["transcript"]
            )

            # Full HGVS annotations collected for this variant
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript model (from the `transcripts` closure variable)
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon number, only when requested
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein accession via refSeqLink
                # NOTE(review): refseqlink_df is only defined when a refSeqLink
                # file was found; presumably use_protein/add_protein/full_format
                # imply its presence — verify, otherwise this raises a NameError
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                            SELECT protein
                            FROM refseqlink_df
                            WHERE transcript='{transcript_name}'
                            LIMIT 1
                            """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name for this transcript
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally add a second, protein-level name for the same
                # transcript when only `add_protein` is requested
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Comma-separated list of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars SQL connexion (registers module/closure globals as tables)
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome folder (falls back to the default genome folder)
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        # Same config key with an empty-string default; used as an explicit
        # genome path candidate before falling back to folder+assembly lookup
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse the comma-separated "hgvs_options" shortcut string
        # into the structured param["hgvs"] dict ("opt" or "opt=val";
        # "true"/"false" values are converted to booleans)
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f" {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; return early otherwise
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq / refSeqLink params override the config-level values
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: explicit path first, then folder+assembly lookup
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (purely alphabetic REF and ALT)
        query_variants = f"""
        SELECT "#CHROM" AS CHROM, POS, REF, ALT
        FROM {table_variants}
        WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
        """
        df_variants = self.get_query_to_df(query_variants)

        # Columns added to the variants table, dropped at the end
        added_columns = []

        # Add hgvs working column in variants table (random suffix to avoid
        # colliding with an existing column)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Load the (CHROM, POS) -> transcript mapping restricted to the
        # variants of interest into a polars Dataframe
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Load the transcript -> protein accession mapping
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table}
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe (queried by the partition closure above)
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model via a temporary TSV
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion
        # NOTE(review): re-creating the SQLContext here looks redundant with the
        # one created at the top of the method — presumably to re-register the
        # dataframes defined since; confirm before removing
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create a Dask Dataframe from the Pandas dataframe, with as many
        # partitions as threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Apply the annotation function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame back to a Pandas Dataframe
        df = ddf.compute()

        # Write the Pandas dataframe to parquet and update from it
        # (works around an error in cast VARCHAR -> NULL, per original comment)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update the hgvs working column from the parquet file
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
            """
            self.execute_query(update_variant_query)

        # Append 'hgvs=<value>' to the INFO column (with ';' separator when
        # INFO is already non-empty)
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO =
                concat(
                    CASE
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
        """
        self.execute_query(sql_query_update)

        # Declare the 'hgvs' INFO field in the VCF header
        # NOTE(review): "annotatation" typo below is part of the emitted header
        # Description string; left untouched here
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
The annotation_hgvs function performs HGVS annotation on a set of variants using
genomic coordinates and alleles.
Parameters
- threads: an optional integer specifying the number of threads to use for
  parallel processing. If no value is provided, it defaults to the number of
  threads obtained from the get_threads() method.
8182 def get_operations_help( 8183 self, operations_config_dict: dict = {}, operations_config_file: str = None 8184 ) -> list: 8185 8186 # Init 8187 operations_help = [] 8188 8189 # operations 8190 operations = self.get_config_json( 8191 name="calculations", 8192 config_dict=operations_config_dict, 8193 config_file=operations_config_file, 8194 ) 8195 for op in operations: 8196 op_name = operations[op].get("name", op).upper() 8197 op_description = operations[op].get("description", op_name) 8198 op_available = operations[op].get("available", False) 8199 if op_available: 8200 operations_help.append(f" {op_name}: {op_description}") 8201 8202 # Sort operations 8203 operations_help.sort() 8204 8205 # insert header 8206 operations_help.insert(0, "Available calculation operations:") 8207 8208 # Return 8209 return operations_help
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        Run the configured calculation operations on the variants table.

        The set of operations to run is resolved in this order:
        1. the `operations` argument, overridden by
           param["calculation"]["calculations"] when present;
        2. extended by the quick param["calculations"] comma-separated string,
           whose operations are inserted first to preserve their order.

        Each resolved operation name is upper-cased, looked up in the
        operations configuration, and dispatched to
        `calculation_process_function` (type "python") or
        `calculation_process_sql` (type "sql"). Finally, INFO fields are
        re-exploded into table columns when `get_explode_infos()` is enabled.

        param json example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    },
                    "middle" : null
                }
            }

        :param operations: dictionary of operations to perform (may be
            overridden by the "calculation" parameter)
        :type operations: dict
        :param operations_config_dict: optional operations configuration dict
        :type operations_config_dict: dict
        :param operations_config_file: optional operations configuration file
        :type operations_config_file: str (optional)
        :raises ValueError: when an operation name or type is not available

        NOTE(review): the mutable default arguments ({}) are shared across
        calls — safe here only as long as they are never mutated in place.
        """

        # Param
        param = self.get_param()

        # Operations configuration
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper-case the configuration keys for case-insensitive lookup
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param override the argument
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation: comma-separated list of operation names
        if param.get("calculations", None):

            # List of operations
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f" {calculation_key}")

            # Create tmp operations dict (to keep quick-operation order first)
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                    # Reuse the options already defined in `operations`, if any
                    add_value_into_dict(
                        dict_tree=operations_tmp,
                        sections=[
                            calculation_operation.upper(),
                        ],
                        value=operations.get(calculation_operation.upper(), {}),
                    )
            # Append operations already in param that were not quick-listed
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Update operations with the merged, ordered dict
            operations = operations_tmp

        # Fallback: operations from param only
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

            # Dispatch each operation by its configured type
            for operation_name in operations:
                operation_name = operation_name.upper()
                if operation_name not in [""]:
                    if operation_name in operations_config:
                        log.info(f"Calculation '{operation_name}'")
                        operation = operations_config[operation_name]
                        operation_type = operation.get("type", "sql")
                        if operation_type == "python":
                            self.calculation_process_function(
                                operation=operation, operation_name=operation_name
                            )
                        elif operation_type == "sql":
                            self.calculation_process_sql(
                                operation=operation, operation_name=operation_name
                            )
                        else:
                            log.error(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                            raise ValueError(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                    else:
                        log.error(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
It takes a dictionary of operations and, for each operation, checks whether it
is a python or sql operation, then calls the appropriate processing function.
param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" }, "middle": null } }
8333 def calculation_process_sql( 8334 self, operation: dict, operation_name: str = "unknown" 8335 ) -> None: 8336 """ 8337 The `calculation_process_sql` function takes in a mathematical operation as a string and 8338 performs the operation, updating the specified table with the result. 8339 8340 :param operation: The `operation` parameter is a dictionary that contains information about the 8341 mathematical operation to be performed. It includes the following keys: 8342 :type operation: dict 8343 :param operation_name: The `operation_name` parameter is a string that represents the name of 8344 the mathematical operation being performed. It is used for logging and error handling purposes, 8345 defaults to unknown 8346 :type operation_name: str (optional) 8347 """ 8348 8349 # Operation infos 8350 operation_name = operation.get("name", "unknown") 8351 log.debug(f"process sql {operation_name}") 8352 output_column_name = operation.get("output_column_name", operation_name) 8353 output_column_type = operation.get("output_column_type", "String") 8354 prefix = operation.get("explode_infos_prefix", "") 8355 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 8356 output_column_description = operation.get( 8357 "output_column_description", f"{operation_name} operation" 8358 ) 8359 operation_query = operation.get("operation_query", None) 8360 if isinstance(operation_query, list): 8361 operation_query = " ".join(operation_query) 8362 operation_info_fields = operation.get("info_fields", []) 8363 operation_info_fields_check = operation.get("info_fields_check", False) 8364 operation_info = operation.get("operation_info", True) 8365 operation_table = operation.get( 8366 "table", self.get_table_variants(clause="alter") 8367 ) 8368 8369 # table variants 8370 if operation_table: 8371 table_variants = operation_table 8372 else: 8373 table_variants = self.get_table_variants(clause="alter") 8374 8375 if operation_query: 8376 8377 # Info fields check 8378 
operation_info_fields_check_result = True 8379 if operation_info_fields_check: 8380 header_infos = self.get_header().infos 8381 for info_field in operation_info_fields: 8382 operation_info_fields_check_result = ( 8383 operation_info_fields_check_result 8384 and info_field in header_infos 8385 ) 8386 8387 # If info fields available 8388 if operation_info_fields_check_result: 8389 8390 # Added_columns 8391 added_columns = [] 8392 8393 # Create VCF header field 8394 vcf_reader = self.get_header() 8395 vcf_reader.infos[output_column_name] = vcf.parser._Info( 8396 output_column_name, 8397 ".", 8398 output_column_type, 8399 output_column_description, 8400 "howard calculation", 8401 "0", 8402 self.code_type_map.get(output_column_type), 8403 ) 8404 8405 # Explode infos if needed 8406 log.debug(f"calculation_process_sql prefix {prefix}") 8407 added_columns += self.explode_infos( 8408 prefix=prefix, 8409 fields=[output_column_name] + operation_info_fields, 8410 force=False, 8411 table=table_variants, 8412 ) 8413 8414 # Create column 8415 added_column = self.add_column( 8416 table_name=table_variants, 8417 column_name=prefix + output_column_name, 8418 column_type=output_column_type_sql, 8419 default_value="null", 8420 ) 8421 added_columns.append(added_column) 8422 8423 # Operation calculation 8424 try: 8425 8426 # Query to update calculation column 8427 sql_update = f""" 8428 UPDATE {table_variants} 8429 SET "{prefix}{output_column_name}" = ({operation_query}) 8430 """ 8431 self.conn.execute(sql_update) 8432 8433 # Add to INFO 8434 if operation_info: 8435 sql_update_info = f""" 8436 UPDATE {table_variants} 8437 SET "INFO" = 8438 concat( 8439 CASE 8440 WHEN "INFO" IS NOT NULL 8441 THEN concat("INFO", ';') 8442 ELSE '' 8443 END, 8444 '{output_column_name}=', 8445 "{prefix}{output_column_name}" 8446 ) 8447 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8448 """ 8449 self.conn.execute(sql_update_info) 8450 8451 except: 8452 
log.error( 8453 f"Operations config: Calculation '{operation_name}' query failed" 8454 ) 8455 raise ValueError( 8456 f"Operations config: Calculation '{operation_name}' query failed" 8457 ) 8458 8459 # Remove added columns 8460 for added_column in added_columns: 8461 log.debug(f"added_column: {added_column}") 8462 self.drop_column(column=added_column) 8463 8464 else: 8465 log.error( 8466 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8467 ) 8468 raise ValueError( 8469 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8470 ) 8471 8472 else: 8473 log.error( 8474 f"Operations config: Calculation '{operation_name}' query NOT defined" 8475 ) 8476 raise ValueError( 8477 f"Operations config: Calculation '{operation_name}' query NOT defined" 8478 )
The calculation_process_sql function takes a mathematical operation described as
a dictionary and performs the operation, updating the specified table with the
result.
Parameters
- operation: a dictionary that contains information about the operation to be
  performed (query, output column name and type, required INFO fields, ...).
- operation_name: a string that represents the name of the operation being
  performed. It is used for logging and error handling purposes; defaults to
  "unknown".
8480 def calculation_process_function( 8481 self, operation: dict, operation_name: str = "unknown" 8482 ) -> None: 8483 """ 8484 The `calculation_process_function` takes in an operation dictionary and performs the specified 8485 function with the given parameters. 8486 8487 :param operation: The `operation` parameter is a dictionary that contains information about the 8488 operation to be performed. It has the following keys: 8489 :type operation: dict 8490 :param operation_name: The `operation_name` parameter is a string that represents the name of 8491 the operation being performed. It is used for logging purposes, defaults to unknown 8492 :type operation_name: str (optional) 8493 """ 8494 8495 operation_name = operation["name"] 8496 log.debug(f"process sql {operation_name}") 8497 function_name = operation["function_name"] 8498 function_params = operation["function_params"] 8499 getattr(self, function_name)(*function_params)
The calculation_process_function takes an operation dictionary and invokes the
specified function with the given parameters.
Parameters
- operation: a dictionary that contains information about the operation to be
  performed (name, function_name, function_params).
- operation_name: a string that represents the name of the operation being
  performed. It is used for logging purposes; defaults to "unknown".
8501 def calculation_variant_id(self) -> None: 8502 """ 8503 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8504 updates the INFO field of a variants table with the variant ID. 8505 """ 8506 8507 # variant_id annotation field 8508 variant_id_tag = self.get_variant_id_column() 8509 added_columns = [variant_id_tag] 8510 8511 # variant_id hgvs tags" 8512 vcf_infos_tags = { 8513 variant_id_tag: "howard variant ID annotation", 8514 } 8515 8516 # Variants table 8517 table_variants = self.get_table_variants() 8518 8519 # Header 8520 vcf_reader = self.get_header() 8521 8522 # Add variant_id to header 8523 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 8524 variant_id_tag, 8525 ".", 8526 "String", 8527 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 8528 "howard calculation", 8529 "0", 8530 self.code_type_map.get("String"), 8531 ) 8532 8533 # Update 8534 sql_update = f""" 8535 UPDATE {table_variants} 8536 SET "INFO" = 8537 concat( 8538 CASE 8539 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8540 THEN '' 8541 ELSE concat("INFO", ';') 8542 END, 8543 '{variant_id_tag}=', 8544 "{variant_id_tag}" 8545 ) 8546 """ 8547 self.conn.execute(sql_update) 8548 8549 # Remove added columns 8550 for added_column in added_columns: 8551 self.drop_column(column=added_column)
The function calculation_variant_id adds a variant ID annotation to the VCF
header and updates the INFO field of the variants table with the variant ID.
8553 def calculation_extract_snpeff_hgvs( 8554 self, 8555 snpeff_hgvs: str = "snpeff_hgvs", 8556 snpeff_field: str = "ANN", 8557 ) -> None: 8558 """ 8559 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 8560 annotation field in a VCF file and adds them as a new column in the variants table. 8561 8562 :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` 8563 function is used to specify the name of the column that will store the HGVS nomenclatures 8564 extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to 8565 snpeff_hgvs 8566 :type snpeff_hgvs: str (optional) 8567 :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` 8568 function represents the field in the VCF file that contains SnpEff annotations. This field is 8569 used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults 8570 to ANN 8571 :type snpeff_field: str (optional) 8572 """ 8573 8574 # Snpeff hgvs tags 8575 vcf_infos_tags = { 8576 snpeff_hgvs: "HGVS nomenclatures from snpEff annotation", 8577 } 8578 8579 # Prefix 8580 prefix = self.get_explode_infos_prefix() 8581 if prefix: 8582 prefix = "INFO/" 8583 8584 # snpEff fields 8585 speff_ann_infos = prefix + snpeff_field 8586 speff_hgvs_infos = prefix + snpeff_hgvs 8587 8588 # Variants table 8589 table_variants = self.get_table_variants() 8590 8591 # Header 8592 vcf_reader = self.get_header() 8593 8594 # Add columns 8595 added_columns = [] 8596 8597 # Explode HGVS field in column 8598 added_columns += self.explode_infos(fields=[snpeff_field]) 8599 8600 if snpeff_field in vcf_reader.infos: 8601 8602 log.debug(vcf_reader.infos[snpeff_field]) 8603 8604 # Extract ANN header 8605 ann_description = vcf_reader.infos[snpeff_field].desc 8606 pattern = r"'(.+?)'" 8607 match = re.search(pattern, ann_description) 8608 if match: 8609 ann_header_match = match.group(1).split(" | ") 
8610 ann_header_desc = {} 8611 for i in range(len(ann_header_match)): 8612 ann_header_info = "".join( 8613 char for char in ann_header_match[i] if char.isalnum() 8614 ) 8615 ann_header_desc[ann_header_info] = ann_header_match[i] 8616 if not ann_header_desc: 8617 raise ValueError("Invalid header description format") 8618 else: 8619 raise ValueError("Invalid header description format") 8620 8621 # Create variant id 8622 variant_id_column = self.get_variant_id_column() 8623 added_columns += [variant_id_column] 8624 8625 # Create dataframe 8626 dataframe_snpeff_hgvs = self.get_query_to_df( 8627 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8628 ) 8629 8630 # Create main NOMEN column 8631 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8632 speff_ann_infos 8633 ].apply( 8634 lambda x: extract_snpeff_hgvs( 8635 str(x), header=list(ann_header_desc.values()) 8636 ) 8637 ) 8638 8639 # Add snpeff_hgvs to header 8640 vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info( 8641 snpeff_hgvs, 8642 ".", 8643 "String", 8644 vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"), 8645 "howard calculation", 8646 "0", 8647 self.code_type_map.get("String"), 8648 ) 8649 8650 # Update 8651 sql_update = f""" 8652 UPDATE variants 8653 SET "INFO" = 8654 concat( 8655 CASE 8656 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8657 THEN '' 8658 ELSE concat("INFO", ';') 8659 END, 8660 CASE 8661 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8662 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8663 THEN concat( 8664 '{snpeff_hgvs}=', 8665 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8666 ) 8667 ELSE '' 8668 END 8669 ) 8670 FROM dataframe_snpeff_hgvs 8671 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8672 8673 """ 8674 self.conn.execute(sql_update) 8675 8676 # Delete dataframe 8677 del dataframe_snpeff_hgvs 8678 gc.collect() 8679 8680 else: 8681 8682 log.warning( 8683 "No snpEff 
annotation. Please Anotate with snpEff before use this calculation option" 8684 ) 8685 8686 # Remove added columns 8687 for added_column in added_columns: 8688 self.drop_column(column=added_column)
The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from
the snpEff annotation field in a VCF file and adds them as a new column in the
variants table.
Parameters
- snpeff_hgvs: the name of the column that will store the HGVS nomenclatures
  extracted from the snpEff annotation field; defaults to "snpeff_hgvs".
- snpeff_field: the INFO field in the VCF file that contains the snpEff
  annotations from which HGVS nomenclatures are extracted; defaults to "ANN".
8690 def calculation_snpeff_ann_explode( 8691 self, 8692 uniquify: bool = True, 8693 output_format: str = "fields", 8694 output_prefix: str = "snpeff_", 8695 snpeff_field: str = "ANN", 8696 ) -> None: 8697 """ 8698 The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by 8699 exploding the HGVS field and updating variant information accordingly. 8700 8701 :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a 8702 boolean flag that determines whether the output should be uniquified or not. When set to `True`, 8703 it indicates that the output should be unique, meaning that duplicate entries should be removed, 8704 defaults to True 8705 :type uniquify: bool (optional) 8706 :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode` 8707 function specifies the format in which the output annotations will be generated. It has a 8708 default value of "fields". You can also set it to "JSON" to output the annotations in JSON 8709 format, defaults to fields 8710 :type output_format: str (optional) 8711 :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode` 8712 method is used to specify the prefix that will be added to the output annotations generated 8713 during the calculation process. This prefix helps to differentiate the newly added annotations 8714 from existing ones in the output data. By default, the, defaults to ANN_ 8715 :type output_prefix: str (optional) 8716 :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode` 8717 function is used to specify the field in the VCF file that contains SnpEff annotations. 
This 8718 field will be processed to explode the HGVS annotations and update the variant information 8719 accordingly, defaults to ANN 8720 :type snpeff_field: str (optional) 8721 """ 8722 8723 # SnpEff annotation field 8724 snpeff_hgvs = "snpeff_ann_explode" 8725 8726 # Snpeff hgvs tags 8727 vcf_infos_tags = { 8728 snpeff_hgvs: "Explode snpEff annotations", 8729 } 8730 8731 # Prefix 8732 prefix = self.get_explode_infos_prefix() 8733 if prefix: 8734 prefix = "INFO/" 8735 8736 # snpEff fields 8737 speff_ann_infos = prefix + snpeff_field 8738 speff_hgvs_infos = prefix + snpeff_hgvs 8739 8740 # Variants table 8741 table_variants = self.get_table_variants() 8742 8743 # Header 8744 vcf_reader = self.get_header() 8745 8746 # Add columns 8747 added_columns = [] 8748 8749 # Explode HGVS field in column 8750 added_columns += self.explode_infos(fields=[snpeff_field]) 8751 log.debug(f"snpeff_field={snpeff_field}") 8752 log.debug(f"added_columns={added_columns}") 8753 8754 if snpeff_field in vcf_reader.infos: 8755 8756 # Extract ANN header 8757 ann_description = vcf_reader.infos[snpeff_field].desc 8758 pattern = r"'(.+?)'" 8759 match = re.search(pattern, ann_description) 8760 if match: 8761 ann_header_match = match.group(1).split(" | ") 8762 ann_header = [] 8763 ann_header_desc = {} 8764 for i in range(len(ann_header_match)): 8765 ann_header_info = "".join( 8766 char for char in ann_header_match[i] if char.isalnum() 8767 ) 8768 ann_header.append(ann_header_info) 8769 ann_header_desc[ann_header_info] = ann_header_match[i] 8770 if not ann_header_desc: 8771 raise ValueError("Invalid header description format") 8772 else: 8773 raise ValueError("Invalid header description format") 8774 8775 # Create variant id 8776 variant_id_column = self.get_variant_id_column() 8777 added_columns += [variant_id_column] 8778 8779 # Create dataframe 8780 dataframe_snpeff_hgvs = self.get_query_to_df( 8781 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8782 ) 8783 
8784 # Create snpEff columns 8785 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8786 speff_ann_infos 8787 ].apply( 8788 lambda x: explode_snpeff_ann( 8789 str(x), 8790 uniquify=uniquify, 8791 output_format=output_format, 8792 prefix=output_prefix, 8793 header=list(ann_header_desc.values()), 8794 ) 8795 ) 8796 8797 # Header 8798 ann_annotations_prefix = "" 8799 if output_format.upper() in ["JSON"]: 8800 ann_annotations_prefix = f"{output_prefix}=" 8801 vcf_reader.infos[output_prefix] = vcf.parser._Info( 8802 output_prefix, 8803 ".", 8804 "String", 8805 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8806 + " - JSON format", 8807 "howard calculation", 8808 "0", 8809 self.code_type_map.get("String"), 8810 ) 8811 else: 8812 for ann_annotation in ann_header: 8813 ann_annotation_id = f"{output_prefix}{ann_annotation}" 8814 vcf_reader.infos[ann_annotation_id] = vcf.parser._Info( 8815 ann_annotation_id, 8816 ".", 8817 "String", 8818 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8819 + f" - '{ann_header_desc[ann_annotation]}' annotation", 8820 "howard calculation", 8821 "0", 8822 self.code_type_map.get("String"), 8823 ) 8824 8825 # Update 8826 sql_update = f""" 8827 UPDATE variants 8828 SET "INFO" = 8829 concat( 8830 CASE 8831 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8832 THEN '' 8833 ELSE concat("INFO", ';') 8834 END, 8835 CASE 8836 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8837 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8838 THEN concat( 8839 '{ann_annotations_prefix}', 8840 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8841 ) 8842 ELSE '' 8843 END 8844 ) 8845 FROM dataframe_snpeff_hgvs 8846 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8847 8848 """ 8849 self.conn.execute(sql_update) 8850 8851 # Delete dataframe 8852 del dataframe_snpeff_hgvs 8853 gc.collect() 8854 8855 else: 8856 8857 log.warning( 8858 "No snpEff annotation. 
Please Anotate with snpEff before use this calculation option" 8859 ) 8860 8861 # Remove added columns 8862 for added_column in added_columns: 8863 self.drop_column(column=added_column)
The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by
exploding the HGVS field and updating variant information accordingly.
Parameters
- uniquify: The
`uniquify` parameter in the `calculation_snpeff_ann_explode` method is a boolean flag that determines whether the output should be uniquified or not. When set to `True`, it indicates that the output should be unique, meaning that duplicate entries should be removed, defaults to True - output_format: The
`output_format` parameter in the `calculation_snpeff_ann_explode` function specifies the format in which the output annotations will be generated. It has a default value of "fields". You can also set it to "JSON" to output the annotations in JSON format, defaults to fields - output_prefix: The
`output_prefix` parameter in the `calculation_snpeff_ann_explode` method is used to specify the prefix that will be added to the output annotations generated during the calculation process. This prefix helps to differentiate the newly added annotations from existing ones in the output data, defaults to snpeff_ - snpeff_field: The
`snpeff_field` parameter in the `calculation_snpeff_ann_explode` function is used to specify the field in the VCF file that contains SnpEff annotations. This field will be processed to explode the HGVS annotations and update the variant information accordingly, defaults to ANN
    def calculation_extract_nomen(self) -> None:
        """
        Extract the HGVS nomenclature (NOMEN) for each variant and append the
        NOMEN sub-fields to the INFO column.

        The HGVS field (calculation option ``hgvs_field``, default "hgvs") is
        exploded into a column, the preferred nomenclature is selected with
        ``find_nomen`` (using a transcripts-of-preference file and/or table
        column, in the order given by ``transcripts_order``), and each NOMEN
        sub-field is declared in the VCF header and appended to INFO.

        :raises ValueError: If the configured transcripts file does not exist.
        """

        # Name of the dict-valued column produced by find_nomen
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: sub-field name -> VCF header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix of exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Columns added along the way (dropped at the end)
        added_columns = []

        # Get HGVS field
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get NOMEN pattern
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # Transcripts-of-preference sources (keyed by source name)
        transcripts_sources = {}

        # Get transcripts file (optional)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                # The first column of the file holds the preferred transcripts
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Get transcripts table (defaults to the variants table)
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Get transcripts column (optional)
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        if transcripts_table and transcripts_column:
            # The transcript-of-preference comes from a table column
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode the column if it does not exist yet
            self.explode_infos(fields=[transcripts_column], table=transcripts_table)
        else:
            # No column source: SELECT NULL AS transcript below
            extra_field_transcript = f"NULL"

        # Transcripts-of-preference source order
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Transcripts from file
        transcripts = transcripts_sources.get("file", [])

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # Extra infos (exploded columns available on the variants table)
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe of hgvs (and optional transcript) per variant
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Create main NOMEN column (dict of NOMEN sub-fields per variant)
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                ),
                axis=1,
            )

            # Explode NOMEN structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each sub-field into its own column
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update
            # NOTE(review): each NOMEN field is prefixed with ';' and an empty
            # INFO becomes '' — INFO may end up starting with ';' when it was
            # originally empty; confirm downstream parsing tolerates this
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                AND variants."POS" = dataframe_hgvs."POS"
                AND variants."REF" = dataframe_hgvs."REF"
                AND variants."ALT" = dataframe_hgvs."ALT"
                """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
9064 def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None: 9065 """ 9066 The function `calculation_find_by_pipeline` performs a calculation to find the number of 9067 pipeline/sample for a variant and updates the variant information in a VCF file. 9068 9069 :param tag: The `tag` parameter is a string that represents the annotation field for the 9070 "findbypipeline" information in the VCF file. It is used to create the annotation field in the 9071 VCF header and to update the corresponding field in the variants table, defaults to 9072 findbypipeline 9073 :type tag: str (optional) 9074 """ 9075 9076 # if FORMAT and samples 9077 if ( 9078 "FORMAT" in self.get_header_columns_as_list() 9079 and self.get_header_sample_list() 9080 ): 9081 9082 # findbypipeline annotation field 9083 findbypipeline_tag = tag 9084 9085 # VCF infos tags 9086 vcf_infos_tags = { 9087 findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})", 9088 } 9089 9090 # Prefix 9091 prefix = self.get_explode_infos_prefix() 9092 9093 # Field 9094 findbypipeline_infos = prefix + findbypipeline_tag 9095 9096 # Variants table 9097 table_variants = self.get_table_variants() 9098 9099 # Header 9100 vcf_reader = self.get_header() 9101 9102 # Create variant id 9103 variant_id_column = self.get_variant_id_column() 9104 added_columns = [variant_id_column] 9105 9106 # variant_id, FORMAT and samples 9107 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9108 self.get_header_sample_list() 9109 ) 9110 9111 # Create dataframe 9112 dataframe_findbypipeline = self.get_query_to_df( 9113 f""" SELECT {samples_fields} FROM {table_variants} """ 9114 ) 9115 9116 # Create findbypipeline column 9117 dataframe_findbypipeline[findbypipeline_infos] = ( 9118 dataframe_findbypipeline.apply( 9119 lambda row: findbypipeline( 9120 row, samples=self.get_header_sample_list() 9121 ), 9122 axis=1, 9123 ) 9124 ) 9125 9126 # Add snpeff_hgvs to header 9127 
vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info( 9128 findbypipeline_tag, 9129 ".", 9130 "String", 9131 vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"), 9132 "howard calculation", 9133 "0", 9134 self.code_type_map.get("String"), 9135 ) 9136 9137 # Update 9138 sql_update = f""" 9139 UPDATE variants 9140 SET "INFO" = 9141 concat( 9142 CASE 9143 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9144 THEN '' 9145 ELSE concat("INFO", ';') 9146 END, 9147 CASE 9148 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 9149 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 9150 THEN concat( 9151 '{findbypipeline_tag}=', 9152 dataframe_findbypipeline."{findbypipeline_infos}" 9153 ) 9154 ELSE '' 9155 END 9156 ) 9157 FROM dataframe_findbypipeline 9158 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 9159 """ 9160 self.conn.execute(sql_update) 9161 9162 # Remove added columns 9163 for added_column in added_columns: 9164 self.drop_column(column=added_column) 9165 9166 # Delete dataframe 9167 del dataframe_findbypipeline 9168 gc.collect()
The function calculation_find_by_pipeline performs a calculation to find the number of
pipeline/sample for a variant and updates the variant information in a VCF file.
Parameters
- tag: The
`tag` parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table, defaults to findbypipeline
9170 def calculation_genotype_concordance(self) -> None: 9171 """ 9172 The function `calculation_genotype_concordance` calculates the genotype concordance for 9173 multi-caller VCF files and updates the variant information in the database. 9174 """ 9175 9176 # if FORMAT and samples 9177 if ( 9178 "FORMAT" in self.get_header_columns_as_list() 9179 and self.get_header_sample_list() 9180 ): 9181 9182 # genotypeconcordance annotation field 9183 genotypeconcordance_tag = "genotypeconcordance" 9184 9185 # VCF infos tags 9186 vcf_infos_tags = { 9187 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 9188 } 9189 9190 # Prefix 9191 prefix = self.get_explode_infos_prefix() 9192 9193 # Field 9194 genotypeconcordance_infos = prefix + genotypeconcordance_tag 9195 9196 # Variants table 9197 table_variants = self.get_table_variants() 9198 9199 # Header 9200 vcf_reader = self.get_header() 9201 9202 # Create variant id 9203 variant_id_column = self.get_variant_id_column() 9204 added_columns = [variant_id_column] 9205 9206 # variant_id, FORMAT and samples 9207 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9208 self.get_header_sample_list() 9209 ) 9210 9211 # Create dataframe 9212 dataframe_genotypeconcordance = self.get_query_to_df( 9213 f""" SELECT {samples_fields} FROM {table_variants} """ 9214 ) 9215 9216 # Create genotypeconcordance column 9217 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 9218 dataframe_genotypeconcordance.apply( 9219 lambda row: genotypeconcordance( 9220 row, samples=self.get_header_sample_list() 9221 ), 9222 axis=1, 9223 ) 9224 ) 9225 9226 # Add genotypeconcordance to header 9227 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 9228 genotypeconcordance_tag, 9229 ".", 9230 "String", 9231 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 9232 "howard calculation", 9233 "0", 9234 self.code_type_map.get("String"), 9235 ) 9236 9237 # Update 9238 sql_update = f""" 9239 
UPDATE variants 9240 SET "INFO" = 9241 concat( 9242 CASE 9243 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9244 THEN '' 9245 ELSE concat("INFO", ';') 9246 END, 9247 CASE 9248 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 9249 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 9250 THEN concat( 9251 '{genotypeconcordance_tag}=', 9252 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 9253 ) 9254 ELSE '' 9255 END 9256 ) 9257 FROM dataframe_genotypeconcordance 9258 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 9259 """ 9260 self.conn.execute(sql_update) 9261 9262 # Remove added columns 9263 for added_column in added_columns: 9264 self.drop_column(column=added_column) 9265 9266 # Delete dataframe 9267 del dataframe_genotypeconcordance 9268 gc.collect()
The function calculation_genotype_concordance calculates the genotype concordance for
multi-caller VCF files and updates the variant information in the database.
9270 def calculation_barcode(self, tag: str = "barcode") -> None: 9271 """ 9272 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 9273 updates the INFO field in the file with the calculated barcode values. 9274 9275 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 9276 name that will be used for the barcode calculation in the VCF file. If no tag name is provided, 9277 the default tag name is set to "barcode", defaults to barcode 9278 :type tag: str (optional) 9279 """ 9280 9281 # if FORMAT and samples 9282 if ( 9283 "FORMAT" in self.get_header_columns_as_list() 9284 and self.get_header_sample_list() 9285 ): 9286 9287 # barcode annotation field 9288 if not tag: 9289 tag = "barcode" 9290 9291 # VCF infos tags 9292 vcf_infos_tags = { 9293 tag: "barcode calculation (VaRank)", 9294 } 9295 9296 # Prefix 9297 prefix = self.get_explode_infos_prefix() 9298 9299 # Field 9300 barcode_infos = prefix + tag 9301 9302 # Variants table 9303 table_variants = self.get_table_variants() 9304 9305 # Header 9306 vcf_reader = self.get_header() 9307 9308 # Create variant id 9309 variant_id_column = self.get_variant_id_column() 9310 added_columns = [variant_id_column] 9311 9312 # variant_id, FORMAT and samples 9313 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9314 self.get_header_sample_list() 9315 ) 9316 9317 # Create dataframe 9318 dataframe_barcode = self.get_query_to_df( 9319 f""" SELECT {samples_fields} FROM {table_variants} """ 9320 ) 9321 9322 # Create barcode column 9323 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9324 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 9325 ) 9326 9327 # Add barcode to header 9328 vcf_reader.infos[tag] = vcf.parser._Info( 9329 tag, 9330 ".", 9331 "String", 9332 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 9333 "howard calculation", 9334 "0", 9335 self.code_type_map.get("String"), 9336 ) 9337 9338 # 
Update 9339 sql_update = f""" 9340 UPDATE {table_variants} 9341 SET "INFO" = 9342 concat( 9343 CASE 9344 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9345 THEN '' 9346 ELSE concat("INFO", ';') 9347 END, 9348 CASE 9349 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 9350 AND dataframe_barcode."{barcode_infos}" NOT NULL 9351 THEN concat( 9352 '{tag}=', 9353 dataframe_barcode."{barcode_infos}" 9354 ) 9355 ELSE '' 9356 END 9357 ) 9358 FROM dataframe_barcode 9359 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9360 """ 9361 self.conn.execute(sql_update) 9362 9363 # Remove added columns 9364 for added_column in added_columns: 9365 self.drop_column(column=added_column) 9366 9367 # Delete dataframe 9368 del dataframe_barcode 9369 gc.collect()
The calculation_barcode function calculates barcode values for variants in a VCF file and
updates the INFO field in the file with the calculated barcode values.
Parameters
- tag: The
`tag` parameter in the `calculation_barcode` function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name is set to "barcode", defaults to barcode
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        Compute a family barcode across the pedigree samples and append it to
        every sample genotype as two new FORMAT fields: ``tag`` (the barcode)
        and ``tag + "S"`` (the comma-separated list of samples used).

        The pedigree is read from the calculation option ``family_pedigree``
        and may be a JSON file path, a JSON string, a comma-separated list of
        sample names, or a dict; when absent, all samples are used.

        :param tag: FORMAT tag used for the barcode family fields, defaults
            to BCF
        :type tag: str (optional)
        :raises ValueError: If the pedigree is malformed or empty.
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (fall back to default when empty)
            if not tag:
                tag = "BCF"

            # VCF infos tags
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # PED param
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file (JSON)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: JSON, or comma-separated sample names
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: treat as comma-separated sample names,
                        # each sample mapped to itself
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict (member -> sample name)
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct the sample list from the pedigree values
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: use all samples
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Field
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and pedigree samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family FORMAT fields to header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Update: append the two new FORMAT values to every sample column
            # and the two tag names to the FORMAT column
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    # Pedigree sample: barcode value + sample list
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    # FORMAT column: append the two new tag names
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    # Sample outside the pedigree: missing values
                    value = "'.'"
                    value_samples = "'.'"
                # For missing genotypes ('./.'), pad with one ':.' per FORMAT
                # key (derived by stripping value characters from FORMAT) so
                # the appended fields stay aligned with the FORMAT keys
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()
The calculation_barcode_family function calculates barcode values for variants in a VCF file
and updates the INFO field in the file with the calculated barcode values.
Parameters
- tag: The
`tag` parameter in the `calculation_barcode_family` function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
9561 def calculation_trio(self) -> None: 9562 """ 9563 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 9564 information to the INFO field of each variant. 9565 """ 9566 9567 # if FORMAT and samples 9568 if ( 9569 "FORMAT" in self.get_header_columns_as_list() 9570 and self.get_header_sample_list() 9571 ): 9572 9573 # trio annotation field 9574 trio_tag = "trio" 9575 9576 # VCF infos tags 9577 vcf_infos_tags = { 9578 "trio": "trio calculation", 9579 } 9580 9581 # Param 9582 param = self.get_param() 9583 9584 # Prefix 9585 prefix = self.get_explode_infos_prefix() 9586 9587 # Trio param 9588 trio_ped = ( 9589 param.get("calculation", {}) 9590 .get("calculations", {}) 9591 .get("TRIO", {}) 9592 .get("trio_pedigree", None) 9593 ) 9594 9595 # Load trio 9596 if trio_ped: 9597 9598 # Trio pedigree is a file 9599 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 9600 log.debug("TRIO pedigree is file") 9601 with open(full_path(trio_ped)) as trio_ped: 9602 trio_ped = json.load(trio_ped) 9603 9604 # Trio pedigree is a string 9605 elif isinstance(trio_ped, str): 9606 log.debug("TRIO pedigree is str") 9607 try: 9608 trio_ped = json.loads(trio_ped) 9609 log.debug("TRIO pedigree is json str") 9610 except ValueError as e: 9611 trio_samples = trio_ped.split(",") 9612 if len(trio_samples) == 3: 9613 trio_ped = { 9614 "father": trio_samples[0], 9615 "mother": trio_samples[1], 9616 "child": trio_samples[2], 9617 } 9618 log.debug("TRIO pedigree is list str") 9619 else: 9620 msg_error = "TRIO pedigree not well formatted" 9621 log.error(msg_error) 9622 raise ValueError(msg_error) 9623 9624 # Trio pedigree is a dict 9625 elif isinstance(trio_ped, dict): 9626 log.debug("TRIO pedigree is dict") 9627 9628 # Trio pedigree is not well formatted 9629 else: 9630 msg_error = "TRIO pedigree not well formatted" 9631 log.error(msg_error) 9632 raise ValueError(msg_error) 9633 9634 # Construct trio list 9635 trio_samples = [ 9636 
trio_ped.get("father", ""), 9637 trio_ped.get("mother", ""), 9638 trio_ped.get("child", ""), 9639 ] 9640 9641 else: 9642 log.debug("TRIO pedigree not defined. Take the first 3 samples") 9643 samples_list = self.get_header_sample_list() 9644 if len(samples_list) >= 3: 9645 trio_samples = self.get_header_sample_list()[0:3] 9646 trio_ped = { 9647 "father": trio_samples[0], 9648 "mother": trio_samples[1], 9649 "child": trio_samples[2], 9650 } 9651 else: 9652 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9653 log.error(msg_error) 9654 raise ValueError(msg_error) 9655 9656 # Check trio pedigree 9657 if not trio_ped or len(trio_ped) != 3: 9658 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9659 log.error(msg_error) 9660 raise ValueError(msg_error) 9661 9662 # Log 9663 log.info( 9664 f"Calculation 'TRIO' - Samples: " 9665 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9666 ) 9667 9668 # Field 9669 trio_infos = prefix + trio_tag 9670 9671 # Variants table 9672 table_variants = self.get_table_variants() 9673 9674 # Header 9675 vcf_reader = self.get_header() 9676 9677 # Create variant id 9678 variant_id_column = self.get_variant_id_column() 9679 added_columns = [variant_id_column] 9680 9681 # variant_id, FORMAT and samples 9682 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9683 self.get_header_sample_list() 9684 ) 9685 9686 # Create dataframe 9687 dataframe_trio = self.get_query_to_df( 9688 f""" SELECT {samples_fields} FROM {table_variants} """ 9689 ) 9690 9691 # Create trio column 9692 dataframe_trio[trio_infos] = dataframe_trio.apply( 9693 lambda row: trio(row, samples=trio_samples), axis=1 9694 ) 9695 9696 # Add trio to header 9697 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9698 trio_tag, 9699 ".", 9700 "String", 9701 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9702 "howard calculation", 9703 "0", 9704 self.code_type_map.get("String"), 9705 ) 9706 9707 # Update 9708 
sql_update = f""" 9709 UPDATE {table_variants} 9710 SET "INFO" = 9711 concat( 9712 CASE 9713 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9714 THEN '' 9715 ELSE concat("INFO", ';') 9716 END, 9717 CASE 9718 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9719 AND dataframe_trio."{trio_infos}" NOT NULL 9720 THEN concat( 9721 '{trio_tag}=', 9722 dataframe_trio."{trio_infos}" 9723 ) 9724 ELSE '' 9725 END 9726 ) 9727 FROM dataframe_trio 9728 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9729 """ 9730 self.conn.execute(sql_update) 9731 9732 # Remove added columns 9733 for added_column in added_columns: 9734 self.drop_column(column=added_column) 9735 9736 # Delete dataframe 9737 del dataframe_trio 9738 gc.collect()
The calculation_trio function performs trio calculations on a VCF file by adding trio
information to the INFO field of each variant.
9740 def calculation_vaf_normalization(self) -> None: 9741 """ 9742 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9743 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9744 :return: The function does not return anything. 9745 """ 9746 9747 # if FORMAT and samples 9748 if ( 9749 "FORMAT" in self.get_header_columns_as_list() 9750 and self.get_header_sample_list() 9751 ): 9752 9753 # vaf_normalization annotation field 9754 vaf_normalization_tag = "VAF" 9755 9756 # VCF infos tags 9757 vcf_infos_tags = { 9758 "VAF": "VAF Variant Frequency", 9759 } 9760 9761 # Prefix 9762 prefix = self.get_explode_infos_prefix() 9763 9764 # Variants table 9765 table_variants = self.get_table_variants() 9766 9767 # Header 9768 vcf_reader = self.get_header() 9769 9770 # Do not calculate if VAF already exists 9771 if "VAF" in vcf_reader.formats: 9772 log.debug("VAF already on genotypes") 9773 return 9774 9775 # Create variant id 9776 variant_id_column = self.get_variant_id_column() 9777 added_columns = [variant_id_column] 9778 9779 # variant_id, FORMAT and samples 9780 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9781 f""" "{sample}" """ for sample in self.get_header_sample_list() 9782 ) 9783 9784 # Create dataframe 9785 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """ 9786 log.debug(f"query={query}") 9787 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9788 9789 vaf_normalization_set = [] 9790 9791 # for each sample vaf_normalization 9792 for sample in self.get_header_sample_list(): 9793 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9794 lambda row: vaf_normalization(row, sample=sample), axis=1 9795 ) 9796 vaf_normalization_set.append( 9797 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9798 ) 9799 9800 # Add VAF to FORMAT 9801 dataframe_vaf_normalization["FORMAT"] = 
dataframe_vaf_normalization[ 9802 "FORMAT" 9803 ].apply(lambda x: str(x) + ":VAF") 9804 vaf_normalization_set.append( 9805 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9806 ) 9807 9808 # Add vaf_normalization to header 9809 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9810 id=vaf_normalization_tag, 9811 num="1", 9812 type="Float", 9813 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9814 type_code=self.code_type_map.get("Float"), 9815 ) 9816 9817 # Create fields to add in INFO 9818 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9819 9820 # Update 9821 sql_update = f""" 9822 UPDATE {table_variants} 9823 SET {sql_vaf_normalization_set} 9824 FROM dataframe_vaf_normalization 9825 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9826 9827 """ 9828 self.conn.execute(sql_update) 9829 9830 # Remove added columns 9831 for added_column in added_columns: 9832 self.drop_column(column=added_column) 9833 9834 # Delete dataframe 9835 del dataframe_vaf_normalization 9836 gc.collect()
The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency)
normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
Returns
The function does not return anything.
9838 def calculation_genotype_stats(self, info: str = "VAF") -> None: 9839 """ 9840 The `calculation_genotype_stats` function calculates genotype statistics for a given information 9841 field in a VCF file and updates the INFO column of the variants table with the calculated 9842 statistics. 9843 9844 :param info: The `info` parameter is a string that represents the type of information for which 9845 genotype statistics are calculated. It is used to generate various VCF info tags for the 9846 statistics, such as the number of occurrences, the list of values, the minimum value, the 9847 maximum value, the mean, the median, defaults to VAF 9848 :type info: str (optional) 9849 """ 9850 9851 # if FORMAT and samples 9852 if ( 9853 "FORMAT" in self.get_header_columns_as_list() 9854 and self.get_header_sample_list() 9855 ): 9856 9857 # vaf_stats annotation field 9858 vaf_stats_tag = info + "_stats" 9859 9860 # VCF infos tags 9861 vcf_infos_tags = { 9862 info + "_stats_nb": f"genotype {info} Statistics - number of {info}", 9863 info + "_stats_list": f"genotype {info} Statistics - list of {info}", 9864 info + "_stats_min": f"genotype {info} Statistics - min {info}", 9865 info + "_stats_max": f"genotype {info} Statistics - max {info}", 9866 info + "_stats_mean": f"genotype {info} Statistics - mean {info}", 9867 info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}", 9868 info 9869 + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}", 9870 } 9871 9872 # Prefix 9873 prefix = self.get_explode_infos_prefix() 9874 9875 # Field 9876 vaf_stats_infos = prefix + vaf_stats_tag 9877 9878 # Variants table 9879 table_variants = self.get_table_variants() 9880 9881 # Header 9882 vcf_reader = self.get_header() 9883 9884 # Create variant id 9885 variant_id_column = self.get_variant_id_column() 9886 added_columns = [variant_id_column] 9887 9888 # variant_id, FORMAT and samples 9889 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9890 
self.get_header_sample_list() 9891 ) 9892 9893 # Create dataframe 9894 dataframe_vaf_stats = self.get_query_to_df( 9895 f""" SELECT {samples_fields} FROM {table_variants} """ 9896 ) 9897 9898 # Create vaf_stats column 9899 dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply( 9900 lambda row: genotype_stats( 9901 row, samples=self.get_header_sample_list(), info=info 9902 ), 9903 axis=1, 9904 ) 9905 9906 # List of vcf tags 9907 sql_vaf_stats_fields = [] 9908 9909 # Check all VAF stats infos 9910 for stat in vcf_infos_tags: 9911 9912 # Extract stats 9913 dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply( 9914 lambda x: dict(x).get(stat, "") 9915 ) 9916 9917 # Add snpeff_hgvs to header 9918 vcf_reader.infos[stat] = vcf.parser._Info( 9919 stat, 9920 ".", 9921 "String", 9922 vcf_infos_tags.get(stat, "genotype statistics"), 9923 "howard calculation", 9924 "0", 9925 self.code_type_map.get("String"), 9926 ) 9927 9928 if len(sql_vaf_stats_fields): 9929 sep = ";" 9930 else: 9931 sep = "" 9932 9933 # Create fields to add in INFO 9934 sql_vaf_stats_fields.append( 9935 f""" 9936 CASE 9937 WHEN dataframe_vaf_stats."{stat}" NOT NULL 9938 THEN concat( 9939 '{sep}{stat}=', 9940 dataframe_vaf_stats."{stat}" 9941 ) 9942 ELSE '' 9943 END 9944 """ 9945 ) 9946 9947 # SQL set for update 9948 sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields) 9949 9950 # Update 9951 sql_update = f""" 9952 UPDATE {table_variants} 9953 SET "INFO" = 9954 concat( 9955 CASE 9956 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9957 THEN '' 9958 ELSE concat("INFO", ';') 9959 END, 9960 {sql_vaf_stats_fields_set} 9961 ) 9962 FROM dataframe_vaf_stats 9963 WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}" 9964 9965 """ 9966 self.conn.execute(sql_update) 9967 9968 # Remove added columns 9969 for added_column in added_columns: 9970 self.drop_column(column=added_column) 9971 9972 # Delete dataframe 9973 del dataframe_vaf_stats 9974 gc.collect()
The calculation_genotype_stats function calculates genotype statistics for a given information
field in a VCF file and updates the INFO column of the variants table with the calculated
statistics.
Parameters
- info: The `info` parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, the median; defaults to VAF.
9976 def calculation_transcripts_annotation( 9977 self, info_json: str = None, info_format: str = None 9978 ) -> None: 9979 """ 9980 The `calculation_transcripts_annotation` function creates a transcripts table and adds an info 9981 field to it if transcripts are available. 9982 9983 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 9984 is a string parameter that represents the information field to be used in the transcripts JSON. 9985 It is used to specify the JSON format for the transcripts information. If no value is provided 9986 when calling the method, it defaults to " 9987 :type info_json: str 9988 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 9989 method is a string parameter that specifies the format of the information field to be used in 9990 the transcripts JSON. It is used to define the format of the information field 9991 :type info_format: str 9992 """ 9993 9994 # Create transcripts table 9995 transcripts_table = self.create_transcript_view() 9996 9997 # Add info field 9998 if transcripts_table: 9999 self.transcript_view_to_variants( 10000 transcripts_table=transcripts_table, 10001 transcripts_info_field_json=info_json, 10002 transcripts_info_field_format=info_format, 10003 ) 10004 else: 10005 log.info("No Transcripts to process. Check param.json file configuration")
The calculation_transcripts_annotation function creates a transcripts table and adds an info
field to it if transcripts are available.
Parameters
- info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method is a string parameter that represents the information field to be used in the transcripts JSON. It is used to specify the JSON format for the transcripts information. If no value is provided when calling the method, it defaults to None.
- info_format: The `info_format` parameter in the `calculation_transcripts_annotation` method is a string parameter that specifies the format of the information field to be used in the transcripts JSON. It is used to define the format of the information field.
10007 def calculation_transcripts_prioritization(self) -> None: 10008 """ 10009 The function `calculation_transcripts_prioritization` creates a transcripts table and 10010 prioritizes transcripts based on certain criteria. 10011 """ 10012 10013 # Create transcripts table 10014 transcripts_table = self.create_transcript_view() 10015 10016 # Add info field 10017 if transcripts_table: 10018 self.transcripts_prioritization(transcripts_table=transcripts_table) 10019 else: 10020 log.info("No Transcripts to process. Check param.json file configuration")
The function calculation_transcripts_prioritization creates a transcripts table and
prioritizes transcripts based on certain criteria.
10022 def calculation_transcripts_export(self) -> None: 10023 """ """ 10024 10025 # Create transcripts table 10026 transcripts_table = self.create_transcript_view() 10027 10028 # Add info field 10029 if transcripts_table: 10030 self.transcripts_export(transcripts_table=transcripts_table) 10031 else: 10032 log.info("No Transcripts to process. Check param.json file configuration")
10038 def transcripts_export( 10039 self, transcripts_table: str = None, param: dict = {} 10040 ) -> bool: 10041 """ """ 10042 10043 log.debug("Start transcripts export...") 10044 10045 # Param 10046 if not param: 10047 param = self.get_param() 10048 10049 # Param export 10050 param_transcript_export = param.get("transcripts", {}).get("export", {}) 10051 10052 # Output file 10053 transcripts_export_output = param_transcript_export.get("output", None) 10054 10055 if not param_transcript_export or not transcripts_export_output: 10056 log.warning(f"No transcriipts export parameters defined!") 10057 return False 10058 10059 # List of transcripts annotations 10060 query_describe = f""" 10061 SELECT column_name 10062 FROM ( 10063 DESCRIBE SELECT * FROM {transcripts_table} 10064 ) 10065 WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO') 10066 """ 10067 transcripts_annotations_list = list( 10068 self.get_query_to_df(query=query_describe)["column_name"] 10069 ) 10070 10071 # Create transcripts table for export 10072 transcripts_table_export = f"{transcripts_table}_export_" + "".join( 10073 random.choices(string.ascii_uppercase + string.digits, k=10) 10074 ) 10075 query_create_transcripts_table_export = f""" 10076 CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table}) 10077 """ 10078 self.execute_query(query=query_create_transcripts_table_export) 10079 10080 # Output file format 10081 transcripts_export_output_format = get_file_format( 10082 filename=transcripts_export_output 10083 ) 10084 10085 # Format VCF - construct INFO 10086 if transcripts_export_output_format in ["vcf"]: 10087 10088 # Construct query update INFO and header 10089 query_update_info = [] 10090 for field in transcripts_annotations_list: 10091 10092 # If field not in header 10093 if field not in self.get_header_infos_list(): 10094 10095 # Add PZ Transcript in header 10096 
self.get_header().infos[field] = vcf.parser._Info( 10097 field, 10098 ".", 10099 "String", 10100 f"Annotation '{field}' from transcript view", 10101 "unknown", 10102 "unknown", 10103 0, 10104 ) 10105 10106 # Add field as INFO/tag 10107 query_update_info.append( 10108 f""" 10109 CASE 10110 WHEN "{field}" IS NOT NULL 10111 THEN concat('{field}=', "{field}", ';') 10112 ELSE '' 10113 END 10114 """ 10115 ) 10116 10117 # Query param 10118 query_update_info_value = ( 10119 f""" concat('', {", ".join(query_update_info)}) """ 10120 ) 10121 query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """ 10122 10123 else: 10124 10125 # Query param 10126 query_update_info_value = f""" NULL """ 10127 query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """ 10128 10129 # Update query INFO column 10130 query_update = f""" 10131 UPDATE {transcripts_table_export} 10132 SET INFO = {query_update_info_value} 10133 10134 """ 10135 self.execute_query(query=query_update) 10136 10137 # Export 10138 self.export_output( 10139 output_file=transcripts_export_output, 10140 query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """, 10141 ) 10142 10143 # Drop transcripts export table 10144 query_drop_transcripts_table_export = f""" 10145 DROP TABLE {transcripts_table_export} 10146 """ 10147 self.execute_query(query=query_drop_transcripts_table_export)
    def transcripts_prioritization(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
        and updates the variants table with the prioritized information.

        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
        This parameter is used to identify the table where the transcripts data is stored for the
        prioritization process
        :type transcripts_table: str
        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
        that contains various configuration settings for the prioritization process of transcripts. It
        is used to customize the behavior of the prioritization algorithm and includes settings such as
        the prefix for prioritization fields, default profiles, and other
        :type param: dict
        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
        transcripts prioritization process is successfully completed, and `False` if there are any
        issues or if no profile is defined for transcripts prioritization.
        """

        # NOTE(review): mutable default `param={}` — safe here only because an
        # empty param is replaced by get_param() below and the default itself is
        # never mutated; however pz_param["pzfields"] below DOES mutate the
        # caller-supplied (or instance) param dict — confirm callers expect this.

        log.debug("Start transcripts prioritization...")

        # Param (fall back on instance parameters)
        if not param:
            param = self.get_param()

        # Variants table
        table_variants = self.get_table_variants()

        # Transcripts table (build the view if not provided)
        if transcripts_table is None:
            transcripts_table = self.create_transcript_view(
                transcripts_table="transcripts", param=param
            )
        if transcripts_table is None:
            # NOTE(review): typo "availalble" in the message — fix separately
            msg_err = "No Transcripts table availalble"
            log.error(msg_err)
            raise ValueError(msg_err)
        log.debug(f"transcripts_table={transcripts_table}")

        # Get transcripts columns
        columns_as_list_query = f"""
            DESCRIBE {transcripts_table}
        """
        columns_as_list = list(
            self.get_query_to_df(columns_as_list_query)["column_name"]
        )

        # Create INFO column if it does not exist (needed by the final UPDATE)
        if "INFO" not in columns_as_list:
            query_add_info = f"""
                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
            """
            self.execute_query(query_add_info)

        # Prioritization param and Force only PZ Score and Flag
        pz_param = param.get("transcripts", {}).get("prioritization", {})

        # PZ profile by default
        pz_profile_default = (
            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
        )

        # Exit if no profile
        if pz_profile_default is None:
            log.warning("No profile defined for transcripts prioritization")
            return False

        # PZ fields: maps source field name -> prefixed INFO tag name
        pz_param_pzfields = {}

        # PZ field holding the selected transcript (e.g. "PTZTranscript")
        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"

        # Add PZ Transcript in header
        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
            pz_fields_transcripts,
            ".",
            "String",
            f"Transcript selected from prioritization process, profile {pz_profile_default}",
            "unknown",
            "unknown",
            code_type_map["String"],
        )

        # Mandatory fields (always produced by the prioritization step)
        pz_mandatory_fields_list = [
            "Score",
            "Flag",
            "Tags",
            "Comment",
            "Infos",
            "Class",
        ]
        pz_mandatory_fields = []
        for pz_mandatory_field in pz_mandatory_fields_list:
            pz_mandatory_fields.append(
                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
            )

        # PZ fields in param: mandatory fields map prefixed->prefixed,
        # extra fields map original->prefixed and are declared in the header
        for pz_field in pz_param.get("pzfields", []):
            if pz_field in pz_mandatory_fields_list:
                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
                    pz_param.get("pzprefix", "PTZ") + pz_field
                )
            else:
                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
                pz_param_pzfields[pz_field] = pz_field_new

                # Add the prefixed annotation field in header
                self.get_header().infos[pz_field_new] = vcf.parser._Info(
                    pz_field_new,
                    ".",
                    "String",
                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
                    "unknown",
                    "unknown",
                    code_type_map["String"],
                )

        # PZ fields param (mutates the param dict — see NOTE above)
        pz_param["pzfields"] = pz_mandatory_fields

        # Prioritization on the transcripts table itself
        prioritization_result = self.prioritization(
            table=transcripts_table,
            pz_param=param.get("transcripts", {}).get("prioritization", {}),
        )
        if not prioritization_result:
            log.warning("Transcripts prioritization not processed")
            return False

        # PZ fields sql query
        query_update_select_list = []
        query_update_concat_list = []
        query_update_order_list = []
        # NOTE(review): iterating a set — the SELECT column order is
        # non-deterministic across runs (harmless for correctness of the
        # UPDATE, but queries are not reproducible byte-for-byte)
        for pz_param_pzfield in set(
            list(pz_param_pzfields.keys()) + pz_mandatory_fields
        ):
            query_update_select_list.append(f" {pz_param_pzfield}, ")

        # Build one ';tag=value' concat fragment per PZ field
        for pz_param_pzfield in pz_param_pzfields:
            query_update_concat_list.append(
                f"""
                , CASE
                    WHEN {pz_param_pzfield} IS NOT NULL
                    THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
                    ELSE ''
                END
                """
            )

        # Order by (defaults to Flag ASC, Score DESC when not configured)
        pz_orders = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_order", {})
        )
        if not pz_orders:
            pz_orders = {
                pz_param.get("pzprefix", "PTZ") + "Flag": "ASC",
                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
            }
        for pz_order in pz_orders:
            query_update_order_list.append(
                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
            )

        # Fields to explode (needed as real columns for the ranking query)
        fields_to_explode = (
            list(pz_param_pzfields.keys())
            + pz_mandatory_fields
            + list(pz_orders.keys())
        )
        # Remove transcript column as a specific transcript column
        if "transcript" in fields_to_explode:
            fields_to_explode.remove("transcript")

        # Fields in transcripts table
        query_transcripts_table = f"""
            DESCRIBE SELECT * FROM {transcripts_table}
        """
        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)

        # Check fields to explode: each must exist in header or in the table
        for field_to_explode in fields_to_explode:
            if field_to_explode not in self.get_header_infos_list() + list(
                query_transcripts_table.column_name
            ):
                msg_err = f"INFO/{field_to_explode} NOT IN header"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Explode fields to explode
        self.explode_infos(
            table=transcripts_table,
            fields=fields_to_explode,
        )

        # Transcript preference file (optional ranked list of transcripts)
        transcripts_preference_file = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts", {})
        )
        transcripts_preference_file = full_path(transcripts_preference_file)

        # Transcript preference forced (preference order wins over PZ order)
        transcript_preference_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_force", False)
        )
        # Transcript version forced (match transcript IDs including version)
        transcript_version_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_version_force", False)
        )

        # Transcripts Ranking
        if transcripts_preference_file:

            # Transcripts file to dataframe
            if os.path.exists(transcripts_preference_file):
                transcripts_preference_dataframe = transcripts_file_to_df(
                    transcripts_preference_file
                )
            else:
                log.error(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )
                raise ValueError(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )

            # Order by depending to transcript preference forcing
            if transcript_preference_force:
                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
            else:
                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """

            # Transcript columns joined depend on version consideration
            # (split_part drops the '.version' suffix for version-less matching)
            if transcript_version_force:
                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
            else:
                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """

            # Query ranking for update: rank transcripts per variant,
            # joining the preference order from the dataframe
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {order_by}
                    ) AS rn
                FROM {transcripts_table}
                LEFT JOIN
                    (
                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
                        FROM transcripts_preference_dataframe
                    ) AS transcripts_preference
                ON {transcripts_version_join}
            """

        else:

            # Query ranking for update (PZ order only, no preference file)
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {" , ".join(query_update_order_list)}
                    ) AS rn
                FROM {transcripts_table}
            """

        # Export Transcripts prioritization infos to variants table:
        # for each variant, take the top-ranked transcript (rn = 1) and
        # append its PZ tags to the INFO column
        query_update = f"""
            WITH RankedTranscripts AS (
                {query_update_ranking}
            )
            UPDATE {table_variants}
            SET
                INFO = CONCAT(CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
                    )
            FROM
                RankedTranscripts
            WHERE
                rn = 1
                AND variants."#CHROM" = RankedTranscripts."#CHROM"
                AND variants."POS" = RankedTranscripts."POS"
                AND variants."REF" = RankedTranscripts."REF"
                AND variants."ALT" = RankedTranscripts."ALT"
        """

        # log.debug(f"query_update={query_update}")
        self.execute_query(query=query_update)

        # Return
        return True
The transcripts_prioritization function prioritizes transcripts based on certain parameters
and updates the variants table with the prioritized information.
Parameters
- transcripts_table: The `transcripts_table` parameter is a string that specifies the name of the table containing transcripts data. If no value is provided, it defaults to "transcripts". This parameter is used to identify the table where the transcripts data is stored for the prioritization process.
- param: The `param` parameter in the `transcripts_prioritization` method is a dictionary that contains various configuration settings for the prioritization process of transcripts. It is used to customize the behavior of the prioritization algorithm and includes settings such as the prefix for prioritization fields, default profiles, and other options.
Returns
The function `transcripts_prioritization` returns a boolean value `True` if the transcripts prioritization process is successfully completed, and `False` if there are any issues or if no profile is defined for transcripts prioritization.
10460 def create_transcript_view_from_columns_map( 10461 self, 10462 transcripts_table: str = "transcripts", 10463 columns_maps: dict = {}, 10464 added_columns: list = [], 10465 temporary_tables: list = None, 10466 annotation_fields: list = None, 10467 column_rename: dict = {}, 10468 column_clean: bool = False, 10469 column_case: str = None, 10470 ) -> tuple[list, list, list]: 10471 """ 10472 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 10473 specified columns mapping for transcripts data. 10474 10475 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10476 of the table where the transcripts data is stored or will be stored in the database. This table 10477 typically contains information about transcripts such as Ensembl transcript IDs, gene names, 10478 scores, predictions, etc. It defaults to "transcripts, defaults to transcripts 10479 :type transcripts_table: str (optional) 10480 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information 10481 about how to map columns from a transcripts table to create a view. Each entry in the 10482 `columns_maps` list represents a mapping configuration for a specific set of columns. It 10483 typically includes details such as the main transcript column and additional information columns 10484 :type columns_maps: dict 10485 :param added_columns: The `added_columns` parameter in the 10486 `create_transcript_view_from_columns_map` function is a list that stores the additional columns 10487 that will be added to the view being created based on the columns map provided. 
These columns 10488 are generated by exploding the transcript information columns along with the main transcript 10489 column 10490 :type added_columns: list 10491 :param temporary_tables: The `temporary_tables` parameter in the 10492 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 10493 tables created during the process of creating a transcript view from a columns map. These 10494 temporary tables are used to store intermediate results or transformations before the final view 10495 is generated 10496 :type temporary_tables: list 10497 :param annotation_fields: The `annotation_fields` parameter in the 10498 `create_transcript_view_from_columns_map` function is a list that stores the fields that are 10499 used for annotation in the query view creation process. These fields are extracted from the 10500 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 10501 :type annotation_fields: list 10502 :param column_rename: The `column_rename` parameter in the 10503 `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify 10504 custom renaming for columns during the creation of the temporary table view. This parameter 10505 provides a mapping of original column names to the desired renamed column names. By using this 10506 parameter, 10507 :type column_rename: dict 10508 :param column_clean: The `column_clean` parameter in the 10509 `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the 10510 column values should be cleaned or not. If set to `True`, the column values will be cleaned by 10511 removing any non-alphanumeric characters from them. 
This cleaning process ensures, defaults to 10512 False 10513 :type column_clean: bool (optional) 10514 :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map` 10515 function is used to specify the case transformation to be applied to the columns during the view 10516 creation process. It allows you to control whether the column values should be converted to 10517 lowercase, uppercase, or remain unchanged 10518 :type column_case: str 10519 :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three 10520 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 10521 """ 10522 10523 log.debug("Start transcrpts view creation from columns map...") 10524 10525 # "from_columns_map": [ 10526 # { 10527 # "transcripts_column": "Ensembl_transcriptid", 10528 # "transcripts_infos_columns": [ 10529 # "genename", 10530 # "Ensembl_geneid", 10531 # "LIST_S2_score", 10532 # "LIST_S2_pred", 10533 # ], 10534 # }, 10535 # { 10536 # "transcripts_column": "Ensembl_transcriptid", 10537 # "transcripts_infos_columns": [ 10538 # "genename", 10539 # "VARITY_R_score", 10540 # "Aloft_pred", 10541 # ], 10542 # }, 10543 # ], 10544 10545 # Init 10546 if temporary_tables is None: 10547 temporary_tables = [] 10548 if annotation_fields is None: 10549 annotation_fields = [] 10550 10551 # Variants table 10552 table_variants = self.get_table_variants() 10553 10554 for columns_map in columns_maps: 10555 10556 # Transcript column 10557 transcripts_column = columns_map.get("transcripts_column", None) 10558 10559 # Transcripts infos columns 10560 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 10561 10562 # Transcripts infos columns rename 10563 column_rename = columns_map.get("column_rename", column_rename) 10564 10565 # Transcripts infos columns clean 10566 column_clean = columns_map.get("column_clean", column_clean) 10567 10568 # Transcripts infos columns case 10569 column_case = 
columns_map.get("column_case", column_case) 10570 10571 if transcripts_column is not None: 10572 10573 # Explode 10574 added_columns += self.explode_infos( 10575 fields=[transcripts_column] + transcripts_infos_columns 10576 ) 10577 10578 # View clauses 10579 clause_select_variants = [] 10580 clause_select_tanscripts = [] 10581 for field in [transcripts_column] + transcripts_infos_columns: 10582 10583 # AS field 10584 as_field = field 10585 10586 # Rename 10587 if column_rename: 10588 as_field = column_rename.get(as_field, as_field) 10589 10590 # Clean 10591 if column_clean: 10592 as_field = clean_annotation_field(as_field) 10593 10594 # Case 10595 if column_case: 10596 if column_case.lower() in ["lower"]: 10597 as_field = as_field.lower() 10598 elif column_case.lower() in ["upper"]: 10599 as_field = as_field.upper() 10600 10601 # Clause select Variants 10602 clause_select_variants.append( 10603 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10604 ) 10605 10606 if field in [transcripts_column]: 10607 clause_select_tanscripts.append( 10608 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10609 ) 10610 else: 10611 clause_select_tanscripts.append( 10612 f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """ 10613 ) 10614 annotation_fields.append(as_field) 10615 10616 # Querey View 10617 query = f""" 10618 SELECT 10619 "#CHROM", POS, REF, ALT, INFO, 10620 "{transcripts_column}" AS 'transcript', 10621 {", ".join(clause_select_tanscripts)} 10622 FROM ( 10623 SELECT 10624 "#CHROM", POS, REF, ALT, INFO, 10625 {", ".join(clause_select_variants)} 10626 FROM {table_variants} 10627 ) 10628 WHERE "{transcripts_column}" IS NOT NULL 10629 """ 10630 10631 # Create temporary table 10632 temporary_table = transcripts_table + "".join( 10633 random.choices(string.ascii_uppercase + string.digits, k=10) 10634 ) 10635 10636 # Temporary_tables 10637 temporary_tables.append(temporary_table) 10638 query_view = f""" 10639 CREATE TEMPORARY TABLE 
{temporary_table} 10640 AS ({query}) 10641 """ 10642 self.execute_query(query=query_view) 10643 10644 return added_columns, temporary_tables, annotation_fields
The create_transcript_view_from_columns_map function generates a temporary table view based on
specified columns mapping for transcripts data.
Parameters
- transcripts_table: The
`transcripts_table`: a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, and predictions. Defaults to "transcripts". - columns_maps: The
columns_mapsparameter is a dictionary that contains information about how to map columns from a transcripts table to create a view. Each entry in thecolumns_mapslist represents a mapping configuration for a specific set of columns. It typically includes details such as the main transcript column and additional information columns - added_columns: The
added_columnsparameter in thecreate_transcript_view_from_columns_mapfunction is a list that stores the additional columns that will be added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column - temporary_tables: The
temporary_tablesparameter in thecreate_transcript_view_from_columns_mapfunction is a list that stores the names of temporary tables created during the process of creating a transcript view from a columns map. These temporary tables are used to store intermediate results or transformations before the final view is generated - annotation_fields: The
annotation_fieldsparameter in thecreate_transcript_view_from_columns_mapfunction is a list that stores the fields that are used for annotation in the query view creation process. These fields are extracted from thetranscripts_columnandtranscripts_infos_columnsspecified in the `columns - column_rename: The
column_renameparameter in thecreate_transcript_view_from_columns_mapfunction is a dictionary that allows you to specify custom renaming for columns during the creation of the temporary table view. This parameter provides a mapping of original column names to the desired renamed column names. By using this parameter, - column_clean: The
`column_clean` parameter in the `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the column values should be cleaned or not. If set to `True`, the column names are cleaned by removing any non-alphanumeric characters from them, ensuring consistent column naming. Defaults to False. - column_case: The
column_caseparameter in thecreate_transcript_view_from_columns_mapfunction is used to specify the case transformation to be applied to the columns during the view creation process. It allows you to control whether the column values should be converted to lowercase, uppercase, or remain unchanged
Returns
The
create_transcript_view_from_columns_mapfunction returns a tuple containing three lists:added_columns,temporary_tables, andannotation_fields.
10646 def create_transcript_view_from_column_format( 10647 self, 10648 transcripts_table: str = "transcripts", 10649 column_formats: dict = {}, 10650 temporary_tables: list = None, 10651 annotation_fields: list = None, 10652 column_rename: dict = {}, 10653 column_clean: bool = False, 10654 column_case: str = None, 10655 ) -> tuple[list, list, list]: 10656 """ 10657 The `create_transcript_view_from_column_format` function generates a transcript view based on 10658 specified column formats, adds additional columns and annotation fields, and returns the list of 10659 temporary tables and annotation fields. 10660 10661 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10662 of the table containing the transcripts data. This table will be used as the base table for 10663 creating the transcript view. The default value for this parameter is "transcripts", but you can 10664 provide a different table name if needed, defaults to transcripts 10665 :type transcripts_table: str (optional) 10666 :param column_formats: The `column_formats` parameter is a dictionary that contains information 10667 about the columns to be used for creating the transcript view. Each entry in the dictionary 10668 specifies the mapping between a transcripts column and a transcripts infos column. This 10669 parameter allows you to define how the columns from the transcripts table should be transformed 10670 or mapped 10671 :type column_formats: dict 10672 :param temporary_tables: The `temporary_tables` parameter in the 10673 `create_transcript_view_from_column_format` function is a list that stores the names of 10674 temporary views created during the process of creating a transcript view from a column format. 
10675 These temporary views are used to manipulate and extract data before generating the final 10676 transcript view 10677 :type temporary_tables: list 10678 :param annotation_fields: The `annotation_fields` parameter in the 10679 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 10680 that are extracted from the temporary views created during the process. These annotation fields 10681 are obtained by querying the temporary views and extracting the column names excluding specific 10682 columns like `#CH 10683 :type annotation_fields: list 10684 :param column_rename: The `column_rename` parameter in the 10685 `create_transcript_view_from_column_format` function is a dictionary that allows you to specify 10686 custom renaming of columns in the transcripts infos table. By providing a mapping of original 10687 column names to new column names in this dictionary, you can rename specific columns during the 10688 process 10689 :type column_rename: dict 10690 :param column_clean: The `column_clean` parameter in the 10691 `create_transcript_view_from_column_format` function is a boolean flag that determines whether 10692 the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns 10693 will be cleaned during the creation of the transcript view based on the specified column format, 10694 defaults to False 10695 :type column_clean: bool (optional) 10696 :param column_case: The `column_case` parameter in the 10697 `create_transcript_view_from_column_format` function is used to specify the case transformation 10698 to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" 10699 to convert the column names to uppercase or lowercase, respectively 10700 :type column_case: str 10701 :return: The `create_transcript_view_from_column_format` function returns two lists: 10702 `temporary_tables` and `annotation_fields`. 
10703 """ 10704 10705 log.debug("Start transcrpts view creation from column format...") 10706 10707 # "from_column_format": [ 10708 # { 10709 # "transcripts_column": "ANN", 10710 # "transcripts_infos_column": "Feature_ID", 10711 # } 10712 # ], 10713 10714 # Init 10715 if temporary_tables is None: 10716 temporary_tables = [] 10717 if annotation_fields is None: 10718 annotation_fields = [] 10719 10720 for column_format in column_formats: 10721 10722 # annotation field and transcript annotation field 10723 annotation_field = column_format.get("transcripts_column", "ANN") 10724 transcript_annotation = column_format.get( 10725 "transcripts_infos_column", "Feature_ID" 10726 ) 10727 10728 # Transcripts infos columns rename 10729 column_rename = column_format.get("column_rename", column_rename) 10730 10731 # Transcripts infos columns clean 10732 column_clean = column_format.get("column_clean", column_clean) 10733 10734 # Transcripts infos columns case 10735 column_case = column_format.get("column_case", column_case) 10736 10737 # Temporary View name 10738 temporary_view_name = transcripts_table + "".join( 10739 random.choices(string.ascii_uppercase + string.digits, k=10) 10740 ) 10741 10742 # Create temporary view name 10743 temporary_view_name = self.annotation_format_to_table( 10744 uniquify=True, 10745 annotation_field=annotation_field, 10746 view_name=temporary_view_name, 10747 annotation_id=transcript_annotation, 10748 column_rename=column_rename, 10749 column_clean=column_clean, 10750 column_case=column_case, 10751 ) 10752 10753 # Annotation fields 10754 if temporary_view_name: 10755 query_annotation_fields = f""" 10756 SELECT * 10757 FROM ( 10758 DESCRIBE SELECT * 10759 FROM {temporary_view_name} 10760 ) 10761 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 10762 """ 10763 df_annotation_fields = self.get_query_to_df( 10764 query=query_annotation_fields 10765 ) 10766 10767 # Add temporary view and annotation fields 10768 
temporary_tables.append(temporary_view_name) 10769 annotation_fields += list(set(df_annotation_fields["column_name"])) 10770 10771 return temporary_tables, annotation_fields
The create_transcript_view_from_column_format function generates a transcript view based on
specified column formats, adds additional columns and annotation fields, and returns the list of
temporary tables and annotation fields.
Parameters
- transcripts_table: The
transcripts_tableparameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. The default value for this parameter is "transcripts", but you can provide a different table name if needed, defaults to transcripts - column_formats: The
column_formatsparameter is a dictionary that contains information about the columns to be used for creating the transcript view. Each entry in the dictionary specifies the mapping between a transcripts column and a transcripts infos column. This parameter allows you to define how the columns from the transcripts table should be transformed or mapped - temporary_tables: The
temporary_tablesparameter in thecreate_transcript_view_from_column_formatfunction is a list that stores the names of temporary views created during the process of creating a transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view - annotation_fields: The
annotation_fieldsparameter in thecreate_transcript_view_from_column_formatfunction is a list that stores the annotation fields that are extracted from the temporary views created during the process. These annotation fields are obtained by querying the temporary views and extracting the column names excluding specific columns like `#CH - column_rename: The
column_renameparameter in thecreate_transcript_view_from_column_formatfunction is a dictionary that allows you to specify custom renaming of columns in the transcripts infos table. By providing a mapping of original column names to new column names in this dictionary, you can rename specific columns during the process - column_clean: The
column_cleanparameter in thecreate_transcript_view_from_column_formatfunction is a boolean flag that determines whether the transcripts infos columns should undergo a cleaning process. If set toTrue, the columns will be cleaned during the creation of the transcript view based on the specified column format, defaults to False - column_case: The
column_caseparameter in thecreate_transcript_view_from_column_formatfunction is used to specify the case transformation to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" to convert the column names to uppercase or lowercase, respectively
Returns
The
`create_transcript_view_from_column_format` function returns two lists: `temporary_tables` and `annotation_fields`.
    def create_transcript_view(
        self,
        transcripts_table: str = None,
        transcripts_table_drop: bool = True,
        param: dict = {},
    ) -> str:
        """
        Build a table with one row per (variant, transcript) by exploding and
        merging transcript annotations described in the "transcripts.struct"
        section of the parameters.

        The structure section may contain "from_columns_map" and
        "from_column_format" entries; each produces temporary tables which are
        then merged (UNION BY NAME) and aggregated per variant and transcript.
        Optionally, transcript identifiers can have their version suffix
        removed and/or be remapped through an alias mapping file.

        :param transcripts_table: Name of the table to create; if None, taken
            from param "transcripts.table", defaults to "transcripts"
        :type transcripts_table: str (optional)
        :param transcripts_table_drop: If True, drop any existing table with
            that name before creating it, defaults to True
        :type transcripts_table_drop: bool (optional)
        :param param: Parameters dict; if empty, the object's parameters are
            used (see `get_param`)
        :type param: dict
        :return: The name of the created transcripts table, or None if no
            "transcripts.struct" section is configured
        """

        log.debug("Start transcripts view creation...")

        # Default table name when none is provided anywhere
        transcripts_table_default = "transcripts"

        # Fall back on the object's parameters
        if not param:
            param = self.get_param()

        # Structure describing how transcript data is laid out in INFO fields
        struct = param.get("transcripts", {}).get("struct", None)

        # Whether to strip the version suffix from transcript IDs (e.g. ".2")
        transcript_id_remove_version = param.get("transcripts", {}).get(
            "transcript_id_remove_version", False
        )

        # Optional transcript alias mapping file (columns: transcript, alias)
        transcript_id_mapping_file = param.get("transcripts", {}).get(
            "transcript_id_mapping_file", None
        )

        # If set, keep only transcripts present in the mapping file
        transcript_id_mapping_force = param.get("transcripts", {}).get(
            "transcript_id_mapping_force", None
        )

        if struct:

            # Transcripts table name (param overrides, then default)
            if transcripts_table is None:
                transcripts_table = param.get("transcripts", {}).get(
                    "table", transcripts_table_default
                )

            # Columns added to the variants table (dropped again at the end)
            added_columns = []

            # Temporary tables produced by the two extraction strategies
            temporary_tables = []

            # Annotation field names gathered from the temporary tables
            annotation_fields = []

            # Strategy 1: transcripts described as parallel comma-separated columns
            columns_maps = struct.get("from_columns_map", [])
            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_columns_map(
                    transcripts_table=transcripts_table,
                    columns_maps=columns_maps,
                    added_columns=added_columns,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            added_columns += added_columns_tmp
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # Strategy 2: transcripts described in a structured field (e.g. 'ANN')
            column_formats = struct.get("from_column_format", [])
            temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_column_format(
                    transcripts_table=transcripts_table,
                    column_formats=column_formats,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # Deduplicate and remove core/reserved fields from annotations
            annotation_fields = list(set(annotation_fields))
            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
                if field in annotation_fields:
                    annotation_fields.remove(field)

            # Merge all temporary tables with UNION BY NAME (columns may differ)
            query_merge = ""
            for temporary_table in list(set(temporary_tables)):

                # First temporary table
                if not query_merge:
                    query_merge = f"""
                        SELECT * FROM {temporary_table}
                    """
                # other temporary table (using UNION)
                else:
                    query_merge += f"""
                        UNION BY NAME SELECT * FROM {temporary_table}
                    """

            # Aliases for the intermediate subqueries
            transcript_table_tmp = "transcripts_tmp"
            transcript_table_tmp2 = "transcripts_tmp2"
            transcript_table_tmp3 = "transcripts_tmp3"

            # Aggregation clauses: one comma-joined distinct list per field
            query_merge_on_transcripts_annotation_fields = []

            # Aggregated list of all transcripts seen for the variant
            query_merge_on_transcripts_annotation_fields.append(
                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
            )

            # Aggregate all annotations fields
            for annotation_field in set(annotation_fields):
                query_merge_on_transcripts_annotation_fields.append(
                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
                )

            # Transcript ID mapping through an alias file
            if transcript_id_mapping_file:

                # Load the mapping into a DataFrame. The variable looks unused
                # but is referenced by name ("transcript_id_mapping_dataframe")
                # in the SQL below — presumably resolved by DuckDB's scan of
                # local DataFrames; do not remove it.
                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
                transcript_id_mapping_dataframe = transcripts_file_to_df(
                    transcript_id_mapping_file, column_names=["transcript", "alias"]
                )

                # Join variants to the mapping on version-stripped alias;
                # select/group-by clauses depend on version stripping
                if transcript_id_remove_version:
                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """
                else:
                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """

                # Group key for the final merge: mapped ID when available,
                # otherwise the original ID (both version-stripped)
                query_transcript_merge_group_by = """
                    CASE
                        WHEN transcript_mapped NOT IN ('')
                        THEN split_part(transcript_mapped, '.', 1)
                        ELSE split_part(transcript_original, '.', 1)
                    END
                """

                # First aggregation: per (variant, original/mapped transcript)
                transcripts_tmp2_query = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    {query_left_join}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
                """

                # Retrieve the column names produced by the merge
                transcripts_tmp2_describe_query = f"""
                    DESCRIBE {transcripts_tmp2_query}
                """
                transcripts_tmp2_describe_list = list(
                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
                        "column_name"
                    ]
                )

                # Create list of columns for select clause
                transcripts_tmp2_describe_select_clause = []
                for field in transcripts_tmp2_describe_list:
                    if field not in [
                        "#CHROM",
                        "POS",
                        "REF",
                        "ALT",
                        "INFO",
                        "transcript_mapped",
                    ]:
                        as_field = field
                        if field in ["transcript_original"]:
                            # NOTE(review): alias "transcripts_mapped" (plural)
                            # differs from "transcript_mapped" above — confirm
                            # this naming is intended
                            as_field = "transcripts_mapped"
                        transcripts_tmp2_describe_select_clause.append(
                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
                        )

                # Second aggregation: merge rows sharing the same mapped ID
                query_merge_on_transcripts = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        CASE
                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
                            THEN ANY_VALUE(transcript_mapped)
                            ELSE ANY_VALUE(transcript_original)
                        END AS transcript,
                        {", ".join(transcripts_tmp2_describe_select_clause)}
                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
                        {query_transcript_merge_group_by}
                """

                # Optionally keep only transcripts listed in the mapping file
                if transcript_id_mapping_force:
                    query_merge_on_transcripts = f"""
                        SELECT *
                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
                    """

            # No transcript mapping
            else:

                # Transcript key, with or without version suffix
                if transcript_id_remove_version:
                    query_transcript_column = f"""
                        split_part({transcript_table_tmp}.transcript, '.', 1)
                    """
                else:
                    query_transcript_column = """
                        transcript
                    """

                # Query sections
                query_transcript_column_select = (
                    f"{query_transcript_column} AS transcript"
                )
                query_transcript_column_group_by = query_transcript_column

                # Single aggregation per (variant, transcript); no mapping
                query_merge_on_transcripts = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
                """

            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")

            # Drop the transcripts table first if requested
            if transcripts_table_drop:
                query_drop = f"""
                    DROP TABLE IF EXISTS {transcripts_table};
                """
                self.execute_query(query=query_drop)

            # Merge and create transcript view
            query_create_view = f"""
                CREATE TABLE IF NOT EXISTS {transcripts_table}
                AS {query_merge_on_transcripts}
            """
            self.execute_query(query=query_create_view)

            # Remove the columns added to the variants table by the explode steps
            for added_column in added_columns:
                self.drop_column(column=added_column)

        else:

            # Nothing to build without a "struct" section
            transcripts_table = None

        return transcripts_table
The create_transcript_view function generates a transcript view by processing data from a
specified table based on provided parameters and structural information.
Parameters
- transcripts_table: The
transcripts_tableparameter in thecreate_transcript_viewfunction is used to specify the name of the table that will store the final transcript view data. If a table name is not provided, the function will create a new table to store the transcript view data, and by default,, defaults to transcripts - transcripts_table_drop: The
transcripts_table_dropparameter in thecreate_transcript_viewfunction is a boolean parameter that determines whether to drop the existing transcripts table before creating a new one. Iftranscripts_table_dropis set toTrue, the function will drop the existing transcripts table if it exists, defaults to True - param: The
paramparameter in thecreate_transcript_viewfunction is a dictionary that contains information needed to create a transcript view. It includes details such as the structure of the transcripts, columns mapping, column formats, and other necessary information for generating the view. This parameter allows for flexibility and customization
Returns
The
create_transcript_viewfunction returns the name of the transcripts table that was created or modified during the execution of the function.
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> str:
        """
        Explode a structured VCF annotation field (e.g. snpEff/VEP 'ANN') into
        a temporary table with one column per sub-annotation and one row per
        annotation entry, plus a 'transcript' column taken from
        `annotation_id`.

        The sub-annotation names are parsed from the field's description in
        the VCF header (the quoted, ' | '-separated list). Each sub-field's
        SQL type is detected from its values before the table is created.

        :param uniquify: Passed to `explode_annotation_format` to deduplicate
            values, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: INFO field holding the structured annotation,
            defaults to "ANN"
        :type annotation_field: str (optional)
        :param annotation_id: Sub-field used as the transcript identifier
            column, defaults to "Feature_ID"
        :type annotation_id: str (optional)
        :param view_name: Name of the temporary table to create, defaults to
            "transcripts"
        :type view_name: str (optional)
        :param column_rename: Mapping of original sub-field names to renamed
            column names (also applied to `annotation_id`)
        :type column_rename: dict
        :param column_clean: If True, strip non-alphanumeric characters from
            column names, defaults to False
        :type column_clean: bool (optional)
        :param column_case: Case transformation for column names ("lower" or
            "upper"), or None to keep them unchanged
        :type column_case: str
        :return: The name of the created table, or None if `annotation_field`
            is not declared in the VCF header
        :raises ValueError: If the header description of the annotation field
            does not contain a parseable quoted sub-field list
        """

        # Name of the intermediate JSON column holding the exploded annotation
        annotation_format = "annotation_explode"

        # Apply rename/clean to the transcript-ID sub-field name so it matches
        # the transformed column names built below
        if column_rename:
            annotation_id = column_rename.get(annotation_id, annotation_id)

        if column_clean:
            annotation_id = clean_annotation_field(annotation_id)

        # Prefix for exploded INFO columns.
        # NOTE(review): any truthy prefix is forced to "INFO/" regardless of
        # its actual value — confirm this is intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Full column names of the annotation field and its exploded form
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header (provides the annotation field description)
        vcf_reader = self.get_header()

        # Columns added to the variants table (dropped again before returning)
        added_columns = []

        # Explode the annotation field into its own column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Parse the sub-field names from the quoted part of the header
            # description, e.g. Description="... 'Allele | Annotation | ...'"
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                # Maps alphanumeric-only sub-field name -> original name
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters for a safe column name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Ensure a variant-ID column exists (added, then dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variants and the exploded annotation column into a DataFrame
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Convert each annotation string into a JSON document of
            # per-entry sub-field values
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Collect the distinct sub-field keys present in the JSON
            # (queries below reference the local DataFrame by name)
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed SELECT expression per key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]
                key_clean = key

                # key rename
                if column_rename:
                    key_clean = column_rename.get(key_clean, key_clean)

                # key clean
                if column_clean:
                    key_clean = clean_annotation_field(key_clean)

                # Key case
                if column_case:
                    if column_case.lower() in ["lower"]:
                        key_clean = key_clean.lower()
                    elif column_case.lower() in ["upper"]:
                        key_clean = key_clean.upper()

                # Extract this key's values to sample them for type detection
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Normalize missing values (None/"" -> NaN) and drop them so
                # type detection only sees real values
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type
                column_type = detect_column_type(df_json_type[key_clean])

                # Cast empty strings to NULL and the rest to the detected type
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create the temporary table, adding the transcript-ID column
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                    )
                );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field not declared in the header: nothing created
            view_name = None

        # Remove the columns added to the variants table
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
The annotation_format_to_table function converts annotation data from a VCF file into a
structured table format, ensuring unique values and creating a temporary table for further
processing or analysis.
Parameters
- uniquify: Boolean flag that determines whether to ensure unique values in the output. If set to True, the function will make sure that the output values are unique. Defaults to True.
- annotation_field: The field in the VCF file that contains the annotation information for each variant. This field is used to extract the annotation details for further processing in the function. Defaults to "ANN".
- annotation_id: The identifier for the annotation feature, used as a column name in the resulting table or view created from the annotation data. It helps to uniquely identify each annotation entry. Defaults to "Feature_ID".
- view_name: The name of the temporary table created to store the transformed annotation data. This table holds the extracted information from the annotation field in a structured format for further processing or analysis. Defaults to "transcripts".
- column_rename: A dictionary that allows custom renaming of columns. By providing key-value pairs, specific columns in the resulting table or view can be renamed.
- column_clean: Boolean flag that determines whether the annotation field names should undergo a cleaning process (e.g. removing unwanted characters or formatting inconsistencies) before further processing. Defaults to False.
- column_case: The case transformation applied to the column names extracted from the annotation data; column names can be set to either lowercase or uppercase for consistency or other specific requirements.
Returns
The function annotation_format_to_table returns the name of the view created, which is stored in the variable view_name.
11257 def transcript_view_to_variants( 11258 self, 11259 transcripts_table: str = None, 11260 transcripts_column_id: str = None, 11261 transcripts_info_json: str = None, 11262 transcripts_info_field_json: str = None, 11263 transcripts_info_format: str = None, 11264 transcripts_info_field_format: str = None, 11265 param: dict = {}, 11266 ) -> bool: 11267 """ 11268 The `transcript_view_to_variants` function updates a variants table with information from 11269 transcripts in JSON format. 11270 11271 :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the 11272 table containing the transcripts data. If this parameter is not provided, the function will 11273 attempt to retrieve it from the `param` dictionary or use a default value of "transcripts" 11274 :type transcripts_table: str 11275 :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the 11276 column in the `transcripts_table` that contains the unique identifier for each transcript. This 11277 identifier is used to match transcripts with variants in the database 11278 :type transcripts_column_id: str 11279 :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name 11280 of the column in the variants table where the transcripts information will be stored in JSON 11281 format. This parameter allows you to define the column in the variants table that will hold the 11282 JSON-formatted information about transcripts 11283 :type transcripts_info_json: str 11284 :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to 11285 specify the field in the VCF header that will contain information about transcripts in JSON 11286 format. 
This field will be added to the VCF header as an INFO field with the specified name 11287 :type transcripts_info_field_json: str 11288 :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the 11289 format of the information about transcripts that will be stored in the variants table. This 11290 format can be used to define how the transcript information will be structured or displayed 11291 within the variants table 11292 :type transcripts_info_format: str 11293 :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to 11294 specify the field in the VCF header that will contain information about transcripts in a 11295 specific format. This field will be added to the VCF header as an INFO field with the specified 11296 name 11297 :type transcripts_info_field_format: str 11298 :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary 11299 that contains various configuration settings related to transcripts. It is used to provide 11300 default values for certain parameters if they are not explicitly provided when calling the 11301 method. The `param` dictionary can be passed as an argument 11302 :type param: dict 11303 :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True` 11304 if the operation is successful and `False` if certain conditions are not met. 
11305 """ 11306 11307 msg_info_prefix = "Start transcripts view to variants annotations" 11308 11309 log.debug(f"{msg_info_prefix}...") 11310 11311 # Default 11312 transcripts_table_default = "transcripts" 11313 transcripts_column_id_default = "transcript" 11314 transcripts_info_json_default = None 11315 transcripts_info_format_default = None 11316 transcripts_info_field_json_default = None 11317 transcripts_info_field_format_default = None 11318 11319 # Param 11320 if not param: 11321 param = self.get_param() 11322 11323 # Transcripts table 11324 if transcripts_table is None: 11325 transcripts_table = param.get("transcripts", {}).get( 11326 "table", transcripts_table_default 11327 ) 11328 11329 # Transcripts column ID 11330 if transcripts_column_id is None: 11331 transcripts_column_id = param.get("transcripts", {}).get( 11332 "column_id", transcripts_column_id_default 11333 ) 11334 11335 # Transcripts info json 11336 if transcripts_info_json is None: 11337 transcripts_info_json = param.get("transcripts", {}).get( 11338 "transcripts_info_json", transcripts_info_json_default 11339 ) 11340 11341 # Transcripts info field JSON 11342 if transcripts_info_field_json is None: 11343 transcripts_info_field_json = param.get("transcripts", {}).get( 11344 "transcripts_info_field_json", transcripts_info_field_json_default 11345 ) 11346 # if transcripts_info_field_json is not None and transcripts_info_json is None: 11347 # transcripts_info_json = transcripts_info_field_json 11348 11349 # Transcripts info format 11350 if transcripts_info_format is None: 11351 transcripts_info_format = param.get("transcripts", {}).get( 11352 "transcripts_info_format", transcripts_info_format_default 11353 ) 11354 11355 # Transcripts info field FORMAT 11356 if transcripts_info_field_format is None: 11357 transcripts_info_field_format = param.get("transcripts", {}).get( 11358 "transcripts_info_field_format", transcripts_info_field_format_default 11359 ) 11360 # if ( 11361 # 
transcripts_info_field_format is not None 11362 # and transcripts_info_format is None 11363 # ): 11364 # transcripts_info_format = transcripts_info_field_format 11365 11366 # Variants table 11367 table_variants = self.get_table_variants() 11368 11369 # Check info columns param 11370 if ( 11371 transcripts_info_json is None 11372 and transcripts_info_field_json is None 11373 and transcripts_info_format is None 11374 and transcripts_info_field_format is None 11375 ): 11376 return False 11377 11378 # Transcripts infos columns 11379 query_transcripts_infos_columns = f""" 11380 SELECT * 11381 FROM ( 11382 DESCRIBE SELECT * FROM {transcripts_table} 11383 ) 11384 WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}') 11385 """ 11386 transcripts_infos_columns = list( 11387 self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"] 11388 ) 11389 11390 # View results 11391 clause_select = [] 11392 clause_to_json = [] 11393 clause_to_format = [] 11394 for field in transcripts_infos_columns: 11395 # Do not consider INFO field for export into fields 11396 if field not in ["INFO"]: 11397 clause_select.append( 11398 f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """ 11399 ) 11400 clause_to_json.append(f""" '{field}': "{field}" """) 11401 clause_to_format.append(f""" "{field}" """) 11402 11403 # Update 11404 update_set_json = [] 11405 update_set_format = [] 11406 11407 # VCF header 11408 vcf_reader = self.get_header() 11409 11410 # Transcripts to info column in JSON 11411 if transcripts_info_json: 11412 11413 # Create column on variants table 11414 self.add_column( 11415 table_name=table_variants, 11416 column_name=transcripts_info_json, 11417 column_type="JSON", 11418 default_value=None, 11419 drop=False, 11420 ) 11421 11422 # Add header 11423 vcf_reader.infos[transcripts_info_json] = vcf.parser._Info( 11424 transcripts_info_json, 11425 ".", 11426 "String", 11427 "Transcripts in JSON format", 11428 "unknwon", 
11429 "unknwon", 11430 self.code_type_map["String"], 11431 ) 11432 11433 # Add to update 11434 update_set_json.append( 11435 f""" {transcripts_info_json}=t.{transcripts_info_json} """ 11436 ) 11437 11438 # Transcripts to info field in JSON 11439 if transcripts_info_field_json: 11440 11441 log.debug(f"{msg_info_prefix} - Annotation in JSON format...") 11442 11443 # Add to update 11444 update_set_json.append( 11445 f""" 11446 INFO = concat( 11447 CASE 11448 WHEN INFO NOT IN ('', '.') 11449 THEN INFO 11450 ELSE '' 11451 END, 11452 CASE 11453 WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.') 11454 THEN concat( 11455 ';{transcripts_info_field_json}=', 11456 t.{transcripts_info_json} 11457 ) 11458 ELSE '' 11459 END 11460 ) 11461 """ 11462 ) 11463 11464 # Add header 11465 vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info( 11466 transcripts_info_field_json, 11467 ".", 11468 "String", 11469 "Transcripts in JSON format", 11470 "unknwon", 11471 "unknwon", 11472 self.code_type_map["String"], 11473 ) 11474 11475 if update_set_json: 11476 11477 # Update query 11478 query_update = f""" 11479 UPDATE {table_variants} 11480 SET {", ".join(update_set_json)} 11481 FROM 11482 ( 11483 SELECT 11484 "#CHROM", POS, REF, ALT, 11485 concat( 11486 '{{', 11487 string_agg( 11488 '"' || "{transcripts_column_id}" || '":' || 11489 to_json(json_output) 11490 ), 11491 '}}' 11492 )::JSON AS {transcripts_info_json} 11493 FROM 11494 ( 11495 SELECT 11496 "#CHROM", POS, REF, ALT, 11497 "{transcripts_column_id}", 11498 to_json( 11499 {{{",".join(clause_to_json)}}} 11500 )::JSON AS json_output 11501 FROM 11502 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11503 WHERE "{transcripts_column_id}" IS NOT NULL 11504 ) 11505 GROUP BY "#CHROM", POS, REF, ALT 11506 ) AS t 11507 WHERE {table_variants}."#CHROM" = t."#CHROM" 11508 AND {table_variants}."POS" = t."POS" 11509 AND {table_variants}."REF" = t."REF" 11510 AND 
{table_variants}."ALT" = t."ALT" 11511 """ 11512 11513 self.execute_query(query=query_update) 11514 11515 # Transcripts to info column in FORMAT 11516 if transcripts_info_format: 11517 11518 # Create column on variants table 11519 self.add_column( 11520 table_name=table_variants, 11521 column_name=transcripts_info_format, 11522 column_type="VARCHAR", 11523 default_value=None, 11524 drop=False, 11525 ) 11526 11527 # Add header 11528 vcf_reader.infos[transcripts_info_format] = vcf.parser._Info( 11529 transcripts_info_format, 11530 ".", 11531 "String", 11532 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11533 "unknwon", 11534 "unknwon", 11535 self.code_type_map["String"], 11536 ) 11537 11538 # Add to update 11539 update_set_format.append( 11540 f""" {transcripts_info_format}=t.{transcripts_info_format} """ 11541 ) 11542 11543 else: 11544 11545 # Set variable for internal queries 11546 transcripts_info_format = "transcripts_info_format" 11547 11548 # Transcripts to info field in JSON 11549 if transcripts_info_field_format: 11550 11551 log.debug(f"{msg_info_prefix} - Annotation in structured format...") 11552 11553 # Add to update 11554 update_set_format.append( 11555 f""" 11556 INFO = concat( 11557 CASE 11558 WHEN INFO NOT IN ('', '.') 11559 THEN INFO 11560 ELSE '' 11561 END, 11562 CASE 11563 WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.') 11564 THEN concat( 11565 ';{transcripts_info_field_format}=', 11566 t.{transcripts_info_format} 11567 ) 11568 ELSE '' 11569 END 11570 ) 11571 """ 11572 ) 11573 11574 # Add header 11575 vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info( 11576 transcripts_info_field_format, 11577 ".", 11578 "String", 11579 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11580 "unknwon", 11581 "unknwon", 11582 self.code_type_map["String"], 11583 ) 11584 11585 if update_set_format: 11586 11587 # Update query 11588 query_update = f""" 11589 UPDATE 
{table_variants} 11590 SET {", ".join(update_set_format)} 11591 FROM 11592 ( 11593 SELECT 11594 "#CHROM", POS, REF, ALT, 11595 string_agg({transcripts_info_format}) AS {transcripts_info_format} 11596 FROM 11597 ( 11598 SELECT 11599 "#CHROM", POS, REF, ALT, 11600 "{transcripts_column_id}", 11601 concat( 11602 "{transcripts_column_id}", 11603 '|', 11604 {", '|', ".join(clause_to_format)} 11605 ) AS {transcripts_info_format} 11606 FROM 11607 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11608 ) 11609 GROUP BY "#CHROM", POS, REF, ALT 11610 ) AS t 11611 WHERE {table_variants}."#CHROM" = t."#CHROM" 11612 AND {table_variants}."POS" = t."POS" 11613 AND {table_variants}."REF" = t."REF" 11614 AND {table_variants}."ALT" = t."ALT" 11615 """ 11616 11617 self.execute_query(query=query_update) 11618 11619 return True
The transcript_view_to_variants function updates a variants table with information from
transcripts in JSON format.
Parameters
- transcripts_table: The name of the table containing the transcripts data. If this parameter is not provided, the function will attempt to retrieve it from the param dictionary, or use the default value "transcripts".
- transcripts_column_id: The column in transcripts_table that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database.
- transcripts_info_json: The name of the column in the variants table where the transcripts information will be stored in JSON format.
- transcripts_info_field_json: The field added to the VCF header as an INFO field that will contain information about transcripts in JSON format.
- transcripts_info_format: The format of the information about transcripts that will be stored in the variants table, defining how the transcript information will be structured or displayed.
- transcripts_info_field_format: The field added to the VCF header as an INFO field that will contain information about transcripts in a specific structured format.
- param: A dictionary that contains various configuration settings related to transcripts; it provides default values for parameters that are not explicitly provided when calling the method.
Returns
The function transcript_view_to_variants returns a boolean value: True if the operation is successful, and False if certain conditions are not met.